diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml
index 88df687bd6..8eede28705 100644
--- a/.github/workflows/c-cpp.yml
+++ b/.github/workflows/c-cpp.yml
@@ -38,7 +38,7 @@ jobs:
   CPU_MAC:
     runs-on: macos-latest
     env:
-      FC: gfortran-11
+      FC: gfortran-14 # see #971
     strategy:
       matrix:
         folder: [ epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum, epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg ]
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
index b90ef84b47..b64e42a22e 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
@@ -1,8 +1,8 @@
 diff --git b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
-index 4fbb8e6ba..f9e2335de 100644
+index 4fbb8e6ba..d5accb9fb 100644
 --- b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
 +++ a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
-@@ -484,23 +484,140 @@ C
+@@ -484,23 +484,142 @@ C
        INTEGER VECSIZE_USED
  
        INTEGER IVEC
@@ -40,7 +40,7 @@ index 4fbb8e6ba..f9e2335de 100644
 +      
 +      IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 +#endif
-+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
++        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
  !$OMP PARALLEL
  !$OMP DO
 -      DO IVEC=1, VECSIZE_USED
@@ -67,7 +67,7 @@ index 4fbb8e6ba..f9e2335de 100644
 +        ENDDO
  !$OMP END DO
  !$OMP END PARALLEL
-+        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
++        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 +#ifdef MG5AMC_MEEXPORTER_CUDACPP
 +      ENDIF
 +
@@ -77,9 +77,10 @@ index 4fbb8e6ba..f9e2335de 100644
 +          STOP
 +        ENDIF
 +        IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
++          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
 +          CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
 +     &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-+     &      SELECTED_HEL2, SELECTED_COL2 )
++     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
 +          FIRST = .FALSE.
 +c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
 +          IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -93,22 +94,23 @@ index 4fbb8e6ba..f9e2335de 100644
 +          ENDIF
 +          WRITE (6,*) 'NGOODHEL =', NGOODHEL
 +          WRITE (6,*) 'NCOMB =', NCOMB
++          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
 +        ENDIF
-+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
++        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
 +        IF ( .NOT. MULTI_CHANNEL ) THEN
 +          CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
 +     &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-+     &      SELECTED_HEL2, SELECTED_COL2 )
++     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
 +        ELSE
 +          IF( SDE_STRAT.NE.1 ) THEN
 +            WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
 +            STOP
 +          ENDIF
-+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
++          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
 +     &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-+     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
++     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
 +        ENDIF
-+        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
++        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
 +      ENDIF
 +
 +      IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
@@ -284,7 +286,7 @@ index 1124a9164..27a6e4674 100644
        open(unit=lun,file=tempname,status='old',ERR=20)
        fopened=.true.
 diff --git b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f
-index e73e654d4..27fbe7302 100644
+index e73e654d4..3072054f2 100644
 --- b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f
 +++ a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f
 @@ -72,7 +72,10 @@ C
@@ -299,15 +301,7 @@ index e73e654d4..27fbe7302 100644
  C     
  C     This is just to temporarily store the reference grid for
  C      helicity of the DiscreteSampler so as to obtain its number of
-@@ -140,6 +143,7 @@ C     ----------
- C     BEGIN CODE
- C     ----------
- 
-+      call counters_smatrix1_start()
-       NTRY(IMIRROR)=NTRY(IMIRROR)+1
-       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
-       DO I=1,NEXTERNAL
-@@ -217,6 +221,17 @@ C     ----------
+@@ -217,6 +220,17 @@ C     ----------
            ENDIF
            IF(NTRY(IMIRROR).EQ.MAXTRIES)THEN
              ISHEL(IMIRROR)=MIN(ISUM_HEL,NGOOD(IMIRROR))
@@ -325,22 +319,3 @@ index e73e654d4..27fbe7302 100644
            ENDIF
          ENDIF
        ELSE IF (.NOT.INIT_MODE) THEN  ! random helicity 
-@@ -234,6 +249,7 @@ C       Include the Jacobian from helicity sampling
-         IHEL = HEL_PICKED
-       ELSE
-         ANS = 1D0
-+        call counters_smatrix1_stop()
-         RETURN
-       ENDIF
-       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
-@@ -278,9 +294,8 @@ C           Set right sign for ANS, based on sign of chosen helicity
-         ENDIF
-       ENDIF
-       ANS=ANS/DBLE(IDEN)
--
-       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
--
-+      call counters_smatrix1_stop()
-       END
- 
- 
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/counters.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/counters.cc
index 742575a6a5..8ef58cce80 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/counters.cc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/counters.cc
@@ -21,26 +21,24 @@ extern "C"
 {
   // Now: fortran=-1, cudacpp=0
   // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int nimplC = 3;
   constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
   const char* iimplC2TXT( int iimplC )
   {
     const int iimplF = iimplC - 1;
     switch( iimplF )
     {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
+      case -1: return "Fortran MEs"; break;
+      case +0: return "CudaCpp MEs"; break;
+      case +1: return "CudaCpp HEL"; break;
       default: assert( false ); break;
     }
   }
 
   static mgOnGpu::Timer<TIMERTYPE> program_timer;
   static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
   static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
   static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
   static int smatrix1multi_counter[nimplC] = { 0 };
 
   void counters_initialise_()
@@ -49,19 +47,6 @@ extern "C"
     return;
   }
 
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
   void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
   {
     const unsigned int iimplC = iimplF2C( *iimplF );
@@ -86,13 +71,23 @@ extern "C"
     printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
     printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
     for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+    {
       if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+      {
+        if( iimplC < nimplC - 1 ) // MEs
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC],
+                  smatrix1multi_counter[iimplC],
+                  smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+        else
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC] );
+      }
+    }
     return;
   }
 }
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
index a052631aa9..78512a5eeb 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %%/bin/nvcc,%%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %%/bin/hipcc,%%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # CUDA_INC is empty: nvcc was not found or $(CUDA_HOME)/include/ does not exist
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # CUDA_INC is empty: nvcc was not found or $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.inc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fcheck_sa.f b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fcheck_sa.f
index 5bbeefbb58..fb942500a5 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fcheck_sa.f
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh
index 012cbdf6a0..ca7decaa37 100755
--- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh
+++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh
@@ -331,6 +331,7 @@ function codeGenAndDiff()
             | awk -vdate="D:20240301000000+01'00'" '{print gensub("(^/ModDate\\().*(\\)>>endobj$)","\\1"date"\\2","g")}' \
             | awk -vdate="D:20240301000000+01'00'" '{print gensub("(^/CreationDate\\().*(\\)$)","\\1"date"\\2","g")}' \
             | awk -vid="0123456789abcdef0123456789abcdef" '{print gensub("(^/ID \\[<).*><.*(>\\]$)","\\1"id"><"id"\\2","g")}' \
+            | awk -vid="0123456789abcdef0123456789abcdef" '{print gensub("(^/ID \\[\\().*\\)\\(.*(\\)\\]$)","\\1"id")("id"\\2","g")}' \
             | awk -vdate="2024-03-01T00:00:00+01:00" '{print gensub("(<xmp:ModifyDate>).*(</xmp:ModifyDate>)","\\1"date"\\2","g")}' \
             | awk -vdate="2024-03-01T00:00:00+01:00" '{print gensub("(<xmp:CreateDate>).*(</xmp:CreateDate>)","\\1"date"\\2","g")}' \
             | awk -vuuid="'uuid=01234567-89ab-cdef-0123-456789abcdef'" '{print gensub("(xapMM:DocumentID=).*(/>$)","\\1"uuid"\\2","g")}' \
diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
index 5982c61ae8..f059e68f5e 100644
--- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
+++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
@@ -62,7 +62,7 @@ generate e+ e- > mu+ mu-
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005379676818847656 [0m
+[1;32mDEBUG: model prefixing  takes 0.005307912826538086 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -177,7 +177,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1
 INFO: Processing color information for process: e+ e- > mu+ mu- @1 
 INFO: Creating files in directory P1_epem_mupmum 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f630143f4c0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f09ed66e490> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -198,18 +198,18 @@ INFO: Finding symmetric diagrams for subprocess group epem_mupmum
 [1;32mDEBUG:  iconfig_to_diag = [0m {1: 1, 2: 2} [1;30m[model_handling.py at line 1544][0m [0m
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2} [1;30m[model_handling.py at line 1545][0m [0m
 Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s
-Wrote files for 8 helas calls in 0.114 s
+Wrote files for 8 helas calls in 0.112 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates FFV2 routines[0m
 ALOHA: aloha creates FFV4 routines[0m
-ALOHA: aloha creates 3 routines in  0.201 s
+ALOHA: aloha creates 3 routines in  0.198 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates FFV2 routines[0m
 ALOHA: aloha creates FFV4 routines[0m
 ALOHA: aloha creates FFV2_4 routines[0m
-ALOHA: aloha creates 7 routines in  0.255 s
+ALOHA: aloha creates 7 routines in  0.253 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV2
@@ -252,9 +252,9 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m2.118s
-user	0m1.862s
-sys	0m0.242s
+real	0m2.067s
+user	0m1.807s
+sys	0m0.251s
 Code generation completed in 2 seconds
 ************************************************************
 *                                                          *
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f
index 38978865ff..ef45890e25 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f
@@ -528,7 +528,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -544,7 +544,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -554,9 +554,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -570,22 +571,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/fcheck_sa.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/fcheck_sa.f
index 37d586be72..f0220047d7 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/fcheck_sa.f
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f
index f1b5fc0e1a..c2a8b78ed6 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f
@@ -143,7 +143,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -267,7 +266,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -312,8 +310,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/counters.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/counters.cc
index 742575a6a5..8ef58cce80 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/counters.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/counters.cc
@@ -21,26 +21,24 @@ extern "C"
 {
   // Now: fortran=-1, cudacpp=0
   // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int nimplC = 3;
   constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
   const char* iimplC2TXT( int iimplC )
   {
     const int iimplF = iimplC - 1;
     switch( iimplF )
     {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
+      case -1: return "Fortran MEs"; break;
+      case +0: return "CudaCpp MEs"; break;
+      case +1: return "CudaCpp HEL"; break;
       default: assert( false ); break;
     }
   }
 
   static mgOnGpu::Timer<TIMERTYPE> program_timer;
   static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
   static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
   static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
   static int smatrix1multi_counter[nimplC] = { 0 };
 
   void counters_initialise_()
@@ -49,19 +47,6 @@ extern "C"
     return;
   }
 
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
   void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
   {
     const unsigned int iimplC = iimplF2C( *iimplF );
@@ -86,13 +71,23 @@ extern "C"
     printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
     printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
     for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+    {
       if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+      {
+        if( iimplC < nimplC - 1 ) // MEs
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC],
+                  smatrix1multi_counter[iimplC],
+                  smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+        else
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC] );
+      }
+    }
     return;
   }
 }
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # CUDA_INC is empty: CUDA_HOME is empty or $(CUDA_HOME)/include/ does not exist
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # CUDA_INC is empty: CUDA_HOME is empty or $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
index ae0e225418..a96bc91d5b 100644
--- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
+++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
@@ -62,7 +62,7 @@ generate e+ e- > mu+ mu-
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.00564265251159668 [0m
+[1;32mDEBUG: model prefixing  takes 0.005346059799194336 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -177,13 +177,13 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
-Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s
+Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates FFV2 routines[0m
 ALOHA: aloha creates FFV4 routines[0m
 ALOHA: aloha creates FFV2_4 routines[0m
-ALOHA: aloha creates 4 routines in  0.271 s
+ALOHA: aloha creates 4 routines in  0.264 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV2
@@ -202,7 +202,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
 quit
 
-real	0m0.674s
-user	0m0.596s
-sys	0m0.058s
+real	0m0.647s
+user	0m0.592s
+sys	0m0.048s
 Code generation completed in 1 seconds
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/fcheck_sa.f b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/fcheck_sa.f
index 37d586be72..f0220047d7 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/fcheck_sa.f
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # CUDA_INC is empty: CUDA_HOME is empty or $(CUDA_HOME)/include/ does not exist
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # CUDA_INC is empty: CUDA_HOME is empty or $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
index d3614c325f..b7616fe096 100644
--- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
+++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005547761917114258 [0m
+[1;32mDEBUG: model prefixing  takes 0.005777120590209961 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1
 INFO: Processing color information for process: g g > t t~ @1 
 INFO: Creating files in directory P1_gg_ttx 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f60decf1ca0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fa5a393fd30> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -198,15 +198,15 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttx
 [1;32mDEBUG:  iconfig_to_diag = [0m {1: 1, 2: 2, 3: 3} [1;30m[model_handling.py at line 1544][0m [0m
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3} [1;30m[model_handling.py at line 1545][0m [0m
 Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s
-Wrote files for 10 helas calls in 0.117 s
+Wrote files for 10 helas calls in 0.115 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates FFV1 routines[0m
-ALOHA: aloha creates 2 routines in  0.145 s
+ALOHA: aloha creates 2 routines in  0.146 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates FFV1 routines[0m
-ALOHA: aloha creates 4 routines in  0.139 s
+ALOHA: aloha creates 4 routines in  0.132 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -241,9 +241,9 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m2.050s
-user	0m1.662s
-sys	0m0.268s
+real	0m1.927s
+user	0m1.671s
+sys	0m0.252s
 Code generation completed in 2 seconds
 ************************************************************
 *                                                          *
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
index f9e2335de4..d5accb9fb2 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
@@ -516,7 +516,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -532,7 +532,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -542,9 +542,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -558,22 +559,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f
index 37d586be72..f0220047d7 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f
index 27fbe7302c..3072054f2d 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f
@@ -143,7 +143,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -249,7 +248,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -294,8 +292,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/counters.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/counters.cc
index 742575a6a5..8ef58cce80 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/counters.cc
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/counters.cc
@@ -21,26 +21,24 @@ extern "C"
 {
   // Now: fortran=-1, cudacpp=0
   // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int nimplC = 3;
   constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
   const char* iimplC2TXT( int iimplC )
   {
     const int iimplF = iimplC - 1;
     switch( iimplF )
     {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
+      case -1: return "Fortran MEs"; break;
+      case +0: return "CudaCpp MEs"; break;
+      case +1: return "CudaCpp HEL"; break;
       default: assert( false ); break;
     }
   }
 
   static mgOnGpu::Timer<TIMERTYPE> program_timer;
   static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
   static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
   static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
   static int smatrix1multi_counter[nimplC] = { 0 };
 
   void counters_initialise_()
@@ -49,19 +47,6 @@ extern "C"
     return;
   }
 
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
   void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
   {
     const unsigned int iimplC = iimplF2C( *iimplF );
@@ -86,13 +71,23 @@ extern "C"
     printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
     printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
     for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+    {
       if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+      {
+        if( iimplC < nimplC - 1 ) // MEs
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC],
+                  smatrix1multi_counter[iimplC],
+                  smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+        else
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC] );
+      }
+    }
     return;
   }
 }
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) if a CUDA installation with an include directory exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # CUDA_INC is empty (e.g. nvcc was not found or $(CUDA_HOME)/include/ does not exist)
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt
index 5f921c39c6..b84f753a35 100644
--- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt
+++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005394935607910156 [0m
+[1;32mDEBUG: model prefixing  takes 0.005595207214355469 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ WEIGHTED<=2 @1  
 INFO: Process has 3 diagrams 
-1 processes with 3 diagrams generated in 0.009 s
+1 processes with 3 diagrams generated in 0.008 s
 Total: 1 processes with 3 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt
 Load PLUGIN.CUDACPP_OUTPUT
@@ -182,7 +182,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates FFV1 routines[0m
-ALOHA: aloha creates 2 routines in  0.145 s
+ALOHA: aloha creates 2 routines in  0.144 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -197,7 +197,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
 quit
 
-real	0m0.549s
-user	0m0.474s
-sys	0m0.058s
-Code generation completed in 0 seconds
+real	0m0.556s
+user	0m0.475s
+sys	0m0.048s
+Code generation completed in 1 seconds
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/fcheck_sa.f b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/fcheck_sa.f
index 37d586be72..f0220047d7 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/fcheck_sa.f
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) if a CUDA installation with an include directory exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # CUDA_INC is empty (e.g. nvcc was not found or $(CUDA_HOME)/include/ does not exist)
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt
index 2ea2a5346a..7fabd11d28 100644
--- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt
+++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.0055332183837890625 [0m
+[1;32mDEBUG: model prefixing  takes 0.005646228790283203 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ WEIGHTED<=2 @1  
 INFO: Process has 3 diagrams 
-1 processes with 3 diagrams generated in 0.008 s
+1 processes with 3 diagrams generated in 0.009 s
 Total: 1 processes with 3 diagrams
 add process g g > t t~ g
 INFO: Checking for minimal orders which gives processes. 
@@ -163,7 +163,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2  
 INFO: Process has 16 diagrams 
-1 processes with 16 diagrams generated in 0.020 s
+1 processes with 16 diagrams generated in 0.019 s
 Total: 2 processes with 19 diagrams
 output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32
 Load PLUGIN.CUDACPP_OUTPUT
@@ -188,7 +188,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1
 INFO: Processing color information for process: g g > t t~ @1 
 INFO: Creating files in directory P2_gg_ttxg 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f62f9ff2ee0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f2f6d99dc70> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -209,7 +209,7 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxg
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [1;30m[model_handling.py at line 1545][0m [0m
 INFO: Creating files in directory P1_gg_ttx 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f62f9ff2ee0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f2f6d99dc70> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -228,22 +228,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttx
 [1;32mDEBUG:  len(subproc_diagrams_for_config) = [0m 3 [1;30m[model_handling.py at line 1520][0m [0m
 [1;32mDEBUG:  iconfig_to_diag = [0m {1: 1, 2: 2, 3: 3} [1;30m[model_handling.py at line 1544][0m [0m
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3} [1;30m[model_handling.py at line 1545][0m [0m
-Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s
-Wrote files for 46 helas calls in 0.281 s
+Generated helas calls for 2 subprocesses (19 diagrams) in 0.043 s
+Wrote files for 46 helas calls in 0.275 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV3 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV4 set of routines with options: P0[0m
-ALOHA: aloha creates 5 routines in  0.336 s
+ALOHA: aloha creates 5 routines in  0.331 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV3 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV4 set of routines with options: P0[0m
-ALOHA: aloha creates 10 routines in  0.317 s
+ALOHA: aloha creates 10 routines in  0.315 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -291,10 +291,10 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m3.722s
-user	0m2.407s
-sys	0m0.292s
-Code generation completed in 4 seconds
+real	0m2.676s
+user	0m2.362s
+sys	0m0.310s
+Code generation completed in 3 seconds
 ************************************************************
 *                                                          *
 *                      W E L C O M E to                    *
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
index f9e2335de4..d5accb9fb2 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
@@ -516,7 +516,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -532,7 +532,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -542,9 +542,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -558,22 +559,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f
index 37d586be72..f0220047d7 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f
index 27fbe7302c..3072054f2d 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f
@@ -143,7 +143,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -249,7 +248,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -294,8 +292,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f
index 29cee23b2e..3b6a3f178d 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f
@@ -516,7 +516,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -532,7 +532,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -542,9 +542,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -558,22 +559,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/fcheck_sa.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/fcheck_sa.f
index 9170a32a19..cb7efdfbcf 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/fcheck_sa.f
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f
index b13c503fae..1dd3491413 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f
@@ -159,7 +159,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -265,7 +264,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -310,8 +308,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/counters.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/counters.cc
index 742575a6a5..8ef58cce80 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/counters.cc
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/counters.cc
@@ -21,26 +21,24 @@ extern "C"
 {
   // Now: fortran=-1, cudacpp=0
   // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int nimplC = 3;
   constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
   const char* iimplC2TXT( int iimplC )
   {
     const int iimplF = iimplC - 1;
     switch( iimplF )
     {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
+      case -1: return "Fortran MEs"; break;
+      case +0: return "CudaCpp MEs"; break;
+      case +1: return "CudaCpp HEL"; break;
       default: assert( false ); break;
     }
   }
 
   static mgOnGpu::Timer<TIMERTYPE> program_timer;
   static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
   static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
   static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
   static int smatrix1multi_counter[nimplC] = { 0 };
 
   void counters_initialise_()
@@ -49,19 +47,6 @@ extern "C"
     return;
   }
 
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
   void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
   {
     const unsigned int iimplC = iimplF2C( *iimplF );
@@ -86,13 +71,23 @@ extern "C"
     printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
     printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
     for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+    {
       if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+      {
+        if( iimplC < nimplC - 1 ) // MEs
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC],
+                  smatrix1multi_counter[iimplC],
+                  smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+        else
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC] );
+      }
+    }
     return;
   }
 }
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) if a CUDA include directory exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # CUDA_INC is empty: nvcc was not found, or $(CUDA_HOME)/include/ does not exist
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # CUDA_INC is empty: nvcc was not found, or $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt
index dc2276a50d..18b1d80415 100644
--- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt
+++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005425453186035156 [0m
+[1;32mDEBUG: model prefixing  takes 0.005260467529296875 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1  
 INFO: Process has 16 diagrams 
-1 processes with 16 diagrams generated in 0.022 s
+1 processes with 16 diagrams generated in 0.021 s
 Total: 1 processes with 16 diagrams
 output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32
 Load PLUGIN.CUDACPP_OUTPUT
@@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1
 INFO: Processing color information for process: g g > t t~ g @1 
 INFO: Creating files in directory P1_gg_ttxg 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f4b85cb1e50> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1716271c70> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -198,21 +198,21 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxg
 [1;32mDEBUG:  iconfig_to_diag = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [1;30m[model_handling.py at line 1544][0m [0m
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [1;30m[model_handling.py at line 1545][0m [0m
 Generated helas calls for 1 subprocesses (16 diagrams) in 0.039 s
-Wrote files for 36 helas calls in 0.165 s
+Wrote files for 36 helas calls in 0.162 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV3 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV4 set of routines with options: P0[0m
-ALOHA: aloha creates 5 routines in  0.330 s
+ALOHA: aloha creates 5 routines in  0.322 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV3 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV4 set of routines with options: P0[0m
-ALOHA: aloha creates 10 routines in  0.317 s
+ALOHA: aloha creates 10 routines in  0.308 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -256,9 +256,9 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m2.506s
-user	0m2.207s
-sys	0m0.271s
+real	0m2.483s
+user	0m2.197s
+sys	0m0.283s
 Code generation completed in 3 seconds
 ************************************************************
 *                                                          *
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f
index c9ca1538d3..1c3ba92e6d 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f
@@ -516,7 +516,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -532,7 +532,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -542,9 +542,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -558,22 +559,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f
index 9170a32a19..cb7efdfbcf 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f
index 3d035277eb..6fdf8a8d07 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f
@@ -159,7 +159,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -265,7 +264,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -310,8 +308,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/counters.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/counters.cc
index 742575a6a5..8ef58cce80 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/counters.cc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/counters.cc
@@ -21,26 +21,24 @@ extern "C"
 {
   // Now: fortran=-1, cudacpp=0
   // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int nimplC = 3;
   constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
   const char* iimplC2TXT( int iimplC )
   {
     const int iimplF = iimplC - 1;
     switch( iimplF )
     {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
+      case -1: return "Fortran MEs"; break;
+      case +0: return "CudaCpp MEs"; break;
+      case +1: return "CudaCpp HEL"; break;
       default: assert( false ); break;
     }
   }
 
   static mgOnGpu::Timer<TIMERTYPE> program_timer;
   static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
   static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
   static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
   static int smatrix1multi_counter[nimplC] = { 0 };
 
   void counters_initialise_()
@@ -49,19 +47,6 @@ extern "C"
     return;
   }
 
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
   void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
   {
     const unsigned int iimplC = iimplF2C( *iimplF );
@@ -86,13 +71,23 @@ extern "C"
     printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
     printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
     for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+    {
       if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+      {
+        if( iimplC < nimplC - 1 ) // MEs
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC],
+                  smatrix1multi_counter[iimplC],
+                  smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+        else
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC] );
+      }
+    }
     return;
   }
 }
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) if a CUDA installation with an include directory exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # CUDA_INC is empty (nvcc not found, or $(CUDA_HOME)/include/ does not exist)
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # CUDA_INC is empty: nvcc not found, or $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param pgoodHelOnly the pointer to the input flag: quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param pgoodHelOnly the pointer to the input flag: quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
index 433938fa3c..a103152d0f 100644
--- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
+++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005648612976074219 [0m
+[1;32mDEBUG: model prefixing  takes 0.00570988655090332 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1  
 INFO: Process has 16 diagrams 
-1 processes with 16 diagrams generated in 0.022 s
+1 processes with 16 diagrams generated in 0.021 s
 Total: 1 processes with 16 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg
 Load PLUGIN.CUDACPP_OUTPUT
@@ -178,14 +178,14 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. 
-Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s
+Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV3 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV4 set of routines with options: P0[0m
-ALOHA: aloha creates 5 routines in  0.327 s
+ALOHA: aloha creates 5 routines in  0.323 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -205,7 +205,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
 quit
 
-real	0m0.818s
-user	0m0.725s
-sys	0m0.053s
+real	0m0.774s
+user	0m0.711s
+sys	0m0.055s
 Code generation completed in 1 seconds
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/fcheck_sa.f b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/fcheck_sa.f
index 9170a32a19..cb7efdfbcf 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/fcheck_sa.f
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) if a CUDA installation with an include directory exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # CUDA_INC is empty (nvcc not found, or $(CUDA_HOME)/include/ does not exist)
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # CUDA_INC is empty: nvcc not found, or $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param pgoodHelOnly the pointer to the input flag: quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param pgoodHelOnly the pointer to the input flag: quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt
index 8412f20e64..816c1d75f7 100644
--- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt
+++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.0055027008056640625 [0m
+[1;32mDEBUG: model prefixing  takes 0.0055654048919677734 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1  
 INFO: Process has 123 diagrams 
-1 processes with 123 diagrams generated in 0.160 s
+1 processes with 123 diagrams generated in 0.156 s
 Total: 1 processes with 123 diagrams
 output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32
 Load PLUGIN.CUDACPP_OUTPUT
@@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1
 INFO: Processing color information for process: g g > t t~ g g @1 
 INFO: Creating files in directory P1_gg_ttxgg 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f89395827f0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fce7b612ca0> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -197,22 +197,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxgg
 [1;32mDEBUG:  len(subproc_diagrams_for_config) = [0m 105 [1;30m[model_handling.py at line 1520][0m [0m
 [1;32mDEBUG:  iconfig_to_diag = [0m {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [1;30m[model_handling.py at line 1544][0m [0m
 [1;32mDEBUG:  diag_to_iconfig = [0m {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [1;30m[model_handling.py at line 1545][0m [0m
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.430 s
-Wrote files for 222 helas calls in 0.712 s
+Generated helas calls for 1 subprocesses (123 diagrams) in 0.428 s
+Wrote files for 222 helas calls in 0.706 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 5 routines in  0.334 s
+ALOHA: aloha creates 5 routines in  0.333 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 10 routines in  0.319 s
+ALOHA: aloha creates 10 routines in  0.317 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -259,9 +259,9 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m3.929s
-user	0m3.539s
-sys	0m0.294s
+real	0m3.822s
+user	0m3.543s
+sys	0m0.260s
 Code generation completed in 4 seconds
 ************************************************************
 *                                                          *
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f
index 208149fcf6..ddc480ec63 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f
@@ -516,7 +516,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -532,7 +532,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -542,9 +542,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -558,22 +559,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/fcheck_sa.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/fcheck_sa.f
index 32f6c3207c..6a66bac979 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/fcheck_sa.f
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f
index 0413417a30..fdcc390db4 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f
@@ -191,7 +191,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -297,7 +296,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -342,8 +340,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/counters.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/counters.cc
index 742575a6a5..8ef58cce80 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/counters.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/counters.cc
@@ -21,26 +21,24 @@ extern "C"
 {
   // Now: fortran=-1, cudacpp=0
   // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int nimplC = 3;
   constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
   const char* iimplC2TXT( int iimplC )
   {
     const int iimplF = iimplC - 1;
     switch( iimplF )
     {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
+      case -1: return "Fortran MEs"; break;
+      case +0: return "CudaCpp MEs"; break;
+      case +1: return "CudaCpp HEL"; break;
       default: assert( false ); break;
     }
   }
 
   static mgOnGpu::Timer<TIMERTYPE> program_timer;
   static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
   static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
   static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
   static int smatrix1multi_counter[nimplC] = { 0 };
 
   void counters_initialise_()
@@ -49,19 +47,6 @@ extern "C"
     return;
   }
 
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
   void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
   {
     const unsigned int iimplC = iimplF2C( *iimplF );
@@ -86,13 +71,23 @@ extern "C"
     printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
     printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
     for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+    {
       if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+      {
+        if( iimplC < nimplC - 1 ) // MEs
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC],
+                  smatrix1multi_counter[iimplC],
+                  smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+        else
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC] );
+      }
+    }
     return;
   }
 }
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) if a CUDA installation and its include directory exist (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # $(CUDA_HOME)/include/ does not exist
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
index ec446c348d..5c8b6b0535 100644
--- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
+++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005393266677856445 [0m
+[1;32mDEBUG: model prefixing  takes 0.0053234100341796875 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1  
 INFO: Process has 123 diagrams 
-1 processes with 123 diagrams generated in 0.158 s
+1 processes with 123 diagrams generated in 0.157 s
 Total: 1 processes with 123 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg
 Load PLUGIN.CUDACPP_OUTPUT
@@ -178,14 +178,14 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.429 s
+Generated helas calls for 1 subprocesses (123 diagrams) in 0.430 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 5 routines in  0.321 s
+ALOHA: aloha creates 5 routines in  0.322 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -208,7 +208,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
 quit
 
-real	0m1.690s
-user	0m1.392s
-sys	0m0.051s
-Code generation completed in 2 seconds
+real	0m1.496s
+user	0m1.376s
+sys	0m0.058s
+Code generation completed in 1 seconds
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/fcheck_sa.f b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/fcheck_sa.f
index 32f6c3207c..6a66bac979 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/fcheck_sa.f
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) if a CUDA installation and its include directory exist (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # $(CUDA_HOME)/include/ does not exist
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
index 80b849a95d..cf81051351 100644
--- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
+++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g g g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005666971206665039 [0m
+[1;32mDEBUG: model prefixing  takes 0.005418062210083008 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1  
 INFO: Process has 1240 diagrams 
-1 processes with 1240 diagrams generated in 1.906 s
+1 processes with 1240 diagrams generated in 1.889 s
 Total: 1 processes with 1240 diagrams
 output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32
 Load PLUGIN.CUDACPP_OUTPUT
@@ -180,7 +180,7 @@ INFO: Creating files in directory P1_gg_ttxggg
 INFO: Computing Color-Flow optimization [15120 term] 
 INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7faef64aaa00> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f22d74c8b50> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -199,22 +199,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxggg
 [1;32mDEBUG:  len(subproc_diagrams_for_config) = [0m 945 [1;30m[model_handling.py at line 1520][0m [0m
 [1;32mDEBUG:  iconfig_to_diag = [0m {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 
252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 
485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 
719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 
958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [1;30m[model_handling.py at line 1544][0m [0m
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 
216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 
416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 
616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 
816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [1;30m[model_handling.py at line 1545][0m [0m
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.619 s
-Wrote files for 2281 helas calls in 18.549 s
+Generated helas calls for 1 subprocesses (1240 diagrams) in 6.527 s
+Wrote files for 2281 helas calls in 18.453 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 5 routines in  0.325 s
+ALOHA: aloha creates 5 routines in  0.318 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 10 routines in  0.369 s
+ALOHA: aloha creates 10 routines in  0.355 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -261,9 +261,9 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m32.883s
-user	0m32.292s
-sys	0m0.459s
+real	0m32.580s
+user	0m32.015s
+sys	0m0.455s
 Code generation completed in 33 seconds
 ************************************************************
 *                                                          *
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f
index 7c94a0776f..5f55c4daed 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f
@@ -516,7 +516,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -532,7 +532,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -542,9 +542,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -558,22 +559,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/fcheck_sa.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/fcheck_sa.f
index 7722c3af16..870c890410 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/fcheck_sa.f
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f
index b9e6d3613f..aefbff4b80 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f
@@ -255,7 +255,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -361,7 +360,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -406,8 +404,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/counters.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/counters.cc
index 742575a6a5..8ef58cce80 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/counters.cc
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/counters.cc
@@ -21,26 +21,24 @@ extern "C"
 {
   // Now: fortran=-1, cudacpp=0
   // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int nimplC = 3;
   constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
   const char* iimplC2TXT( int iimplC )
   {
     const int iimplF = iimplC - 1;
     switch( iimplF )
     {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
+      case -1: return "Fortran MEs"; break;
+      case +0: return "CudaCpp MEs"; break;
+      case +1: return "CudaCpp HEL"; break;
       default: assert( false ); break;
     }
   }
 
   static mgOnGpu::Timer<TIMERTYPE> program_timer;
   static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
   static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
   static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
   static int smatrix1multi_counter[nimplC] = { 0 };
 
   void counters_initialise_()
@@ -49,19 +47,6 @@ extern "C"
     return;
   }
 
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
   void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
   {
     const unsigned int iimplC = iimplF2C( *iimplF );
@@ -86,13 +71,23 @@ extern "C"
     printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
     printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
     for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+    {
       if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+      {
+        if( iimplC < nimplC - 1 ) // MEs
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC],
+                  smatrix1multi_counter[iimplC],
+                  smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+        else
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC] );
+      }
+    }
     return;
   }
 }
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # $(CUDA_HOME)/include/ does not exist
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
index 9fa53f086d..70ece972f5 100644
--- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
+++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g g g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005432844161987305 [0m
+[1;32mDEBUG: model prefixing  takes 0.005778312683105469 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1  
 INFO: Process has 1240 diagrams 
-1 processes with 1240 diagrams generated in 1.902 s
+1 processes with 1240 diagrams generated in 1.872 s
 Total: 1 processes with 1240 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg
 Load PLUGIN.CUDACPP_OUTPUT
@@ -178,14 +178,14 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.640 s
+Generated helas calls for 1 subprocesses (1240 diagrams) in 6.585 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 5 routines in  0.354 s
+ALOHA: aloha creates 5 routines in  0.348 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -208,7 +208,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
 quit
 
-real	0m13.132s
-user	0m12.955s
-sys	0m0.111s
-Code generation completed in 13 seconds
+real	0m13.103s
+user	0m12.928s
+sys	0m0.109s
+Code generation completed in 14 seconds
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/fcheck_sa.f b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/fcheck_sa.f
index 7722c3af16..870c890410 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/fcheck_sa.f
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) if a CUDA installation with an include directory exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # $(CUDA_HOME)/include/ does not exist
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt
index f5c94e00cd..cb97eb9e35 100644
--- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt
+++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
 define q = u c d s u~ c~ d~ s~
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005633831024169922 [0m
+[1;32mDEBUG: model prefixing  takes 0.005686521530151367 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams.
 INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. 
 INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. 
 INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.080 s
+8 processes with 40 diagrams generated in 0.076 s
 Total: 8 processes with 40 diagrams
 output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32
 Load PLUGIN.CUDACPP_OUTPUT
@@ -201,7 +201,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~
 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 
 INFO: Creating files in directory P1_gu_ttxu 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f38706e0910> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f9e8f1798b0> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -222,7 +222,7 @@ INFO: Finding symmetric diagrams for subprocess group gu_ttxu
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [1;30m[model_handling.py at line 1545][0m [0m
 INFO: Creating files in directory P1_gux_ttxux 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f38705108e0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f9e8efaa1c0> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -241,7 +241,7 @@ INFO: Finding symmetric diagrams for subprocess group gux_ttxux
 [1;32mDEBUG:  len(subproc_diagrams_for_config) = [0m 5 [1;30m[model_handling.py at line 1520][0m [0m
 [1;32mDEBUG:  iconfig_to_diag = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [1;30m[model_handling.py at line 1544][0m [0m
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [1;30m[model_handling.py at line 1545][0m [0m
-Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s
+Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s
 Wrote files for 32 helas calls in 0.249 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
@@ -250,7 +250,7 @@ ALOHA: aloha creates 2 routines in  0.146 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVV1 routines[0m
-ALOHA: aloha creates 4 routines in  0.134 s
+ALOHA: aloha creates 4 routines in  0.133 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -302,10 +302,10 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m2.313s
-user	0m1.990s
-sys	0m0.293s
-Code generation completed in 2 seconds
+real	0m3.389s
+user	0m1.964s
+sys	0m0.295s
+Code generation completed in 3 seconds
 ************************************************************
 *                                                          *
 *                      W E L C O M E to                    *
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f
index 2c11f53b89..3d7efb5585 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f
@@ -560,7 +560,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -576,7 +576,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -586,9 +586,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -602,22 +603,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f
index 9170a32a19..cb7efdfbcf 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f
index b3c4ec75f6..c1fb026c9e 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f
@@ -162,7 +162,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -281,7 +280,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -326,8 +324,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f
index d829a73049..d65bac7611 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f
@@ -560,7 +560,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -576,7 +576,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -586,9 +586,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -602,22 +603,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f
index 9170a32a19..cb7efdfbcf 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f
index 20ec98ad2f..bbe2b8626e 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f
@@ -162,7 +162,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -281,7 +280,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -326,8 +324,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/counters.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/counters.cc
index 742575a6a5..8ef58cce80 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/counters.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/counters.cc
@@ -21,26 +21,24 @@ extern "C"
 {
   // Now: fortran=-1, cudacpp=0
   // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int nimplC = 3;
   constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
   const char* iimplC2TXT( int iimplC )
   {
     const int iimplF = iimplC - 1;
     switch( iimplF )
     {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
+      case -1: return "Fortran MEs"; break;
+      case +0: return "CudaCpp MEs"; break;
+      case +1: return "CudaCpp HEL"; break;
       default: assert( false ); break;
     }
   }
 
   static mgOnGpu::Timer<TIMERTYPE> program_timer;
   static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
   static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
   static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
   static int smatrix1multi_counter[nimplC] = { 0 };
 
   void counters_initialise_()
@@ -49,19 +47,6 @@ extern "C"
     return;
   }
 
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
   void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
   {
     const unsigned int iimplC = iimplF2C( *iimplF );
@@ -86,13 +71,23 @@ extern "C"
     printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
     printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
     for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+    {
       if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+      {
+        if( iimplC < nimplC - 1 ) // MEs
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC],
+                  smatrix1multi_counter[iimplC],
+                  smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+        else
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC] );
+      }
+    }
     return;
   }
 }
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) if a CUDA installation with an include directory exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # $(CUDA_HOME)/include/ does not exist
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
index 96ced9fbc8..1548b0cef5 100644
--- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
+++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
 define q = u c d s u~ c~ d~ s~
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.0056612491607666016 [0m
+[1;32mDEBUG: model prefixing  takes 0.005625486373901367 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams.
 INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. 
 INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. 
 INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.080 s
+8 processes with 40 diagrams generated in 0.077 s
 Total: 8 processes with 40 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq
 Load PLUGIN.CUDACPP_OUTPUT
@@ -210,11 +210,11 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. 
-Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s
+Generated helas calls for 2 subprocesses (10 diagrams) in 0.032 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVV1 routines[0m
-ALOHA: aloha creates 2 routines in  0.145 s
+ALOHA: aloha creates 2 routines in  0.144 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -230,7 +230,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
 quit
 
-real	0m0.656s
-user	0m0.589s
-sys	0m0.057s
+real	0m0.659s
+user	0m0.597s
+sys	0m0.049s
 Code generation completed in 1 seconds
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/fcheck_sa.f b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/fcheck_sa.f
index 9170a32a19..cb7efdfbcf 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/fcheck_sa.f
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/fcheck_sa.f b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/fcheck_sa.f
index 9170a32a19..cb7efdfbcf 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/fcheck_sa.f
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) if a CUDA installation with an include directory exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # $(CUDA_HOME)/include/ does not exist
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt
index 71b6f32fa3..d530a89960 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt
+++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt
@@ -150,7 +150,7 @@ INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1
 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 
 INFO: Creating files in directory P1_gg_bbx 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fb9a2f97fa0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fa44dca6fa0> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -170,19 +170,19 @@ INFO: Finding symmetric diagrams for subprocess group gg_bbx
 [1;32mDEBUG:  iconfig_to_diag = [0m {1: 1, 2: 2, 3: 3, 4: 4} [1;30m[model_handling.py at line 1544][0m [0m
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4} [1;30m[model_handling.py at line 1545][0m [0m
 Generated helas calls for 1 subprocesses (4 diagrams) in 0.009 s
-Wrote files for 12 helas calls in 0.118 s
+Wrote files for 12 helas calls in 0.119 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVS3 routines[0m
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates FFS2 routines[0m
-ALOHA: aloha creates 4 routines in  0.269 s
+ALOHA: aloha creates 4 routines in  0.262 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVS3 routines[0m
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates FFS2 routines[0m
-ALOHA: aloha creates 8 routines in  0.251 s
+ALOHA: aloha creates 8 routines in  0.249 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVS3
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -219,9 +219,9 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m2.198s
-user	0m1.905s
-sys	0m0.268s
+real	0m3.154s
+user	0m1.883s
+sys	0m0.276s
 Code generation completed in 3 seconds
 ************************************************************
 *                                                          *
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f
index d2b257590d..b8bcf54554 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f
@@ -516,7 +516,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -532,7 +532,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -542,9 +542,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -558,22 +559,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/fcheck_sa.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/fcheck_sa.f
index 37d586be72..f0220047d7 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/fcheck_sa.f
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f
index 9ae8713f43..5c1baf8703 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f
@@ -143,7 +143,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -249,7 +248,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -294,8 +292,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/counters.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/counters.cc
index 742575a6a5..8ef58cce80 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/counters.cc
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/counters.cc
@@ -21,26 +21,24 @@ extern "C"
 {
   // Now: fortran=-1, cudacpp=0
   // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int nimplC = 3;
   constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
   const char* iimplC2TXT( int iimplC )
   {
     const int iimplF = iimplC - 1;
     switch( iimplF )
     {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
+      case -1: return "Fortran MEs"; break;
+      case +0: return "CudaCpp MEs"; break;
+      case +1: return "CudaCpp HEL"; break;
       default: assert( false ); break;
     }
   }
 
   static mgOnGpu::Timer<TIMERTYPE> program_timer;
   static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
   static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
   static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
   static int smatrix1multi_counter[nimplC] = { 0 };
 
   void counters_initialise_()
@@ -49,19 +47,6 @@ extern "C"
     return;
   }
 
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
   void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
   {
     const unsigned int iimplC = iimplF2C( *iimplF );
@@ -86,13 +71,23 @@ extern "C"
     printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
     printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
     for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+    {
       if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+      {
+        if( iimplC < nimplC - 1 ) // MEs
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC],
+                  smatrix1multi_counter[iimplC],
+                  smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+        else
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC] );
+      }
+    }
     return;
   }
 }
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) if a CUDA installation with an include directory exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # $(CUDA_HOME)/include/ does not exist
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt
index b38ca5ac91..14cb5a6988 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt
+++ b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt
@@ -156,7 +156,7 @@ ALOHA: aloha creates VVS3 routines[0m
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates FFS2 routines[0m
-ALOHA: aloha creates 4 routines in  0.264 s
+ALOHA: aloha creates 4 routines in  0.278 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVS3
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -173,7 +173,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. 
 quit
 
-real	0m0.666s
-user	0m0.585s
-sys	0m0.056s
+real	0m0.756s
+user	0m0.610s
+sys	0m0.064s
 Code generation completed in 1 seconds
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/fcheck_sa.f b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/fcheck_sa.f
index 37d586be72..f0220047d7 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/fcheck_sa.f
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) if a CUDA installation with an include directory exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # $(CUDA_HOME)/include/ does not exist
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
index 439cf73e6a..c6b7a90b66 100644
--- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
+++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
 define j = p
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005627632141113281 [0m
+[1;32mDEBUG: model prefixing  takes 0.00522923469543457 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~
 INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g 
 INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ 
 INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g 
-13 processes with 76 diagrams generated in 0.137 s
+13 processes with 76 diagrams generated in 0.135 s
 Total: 18 processes with 83 diagrams
 add process p p > t t~ j j @2
 INFO: Checking for minimal orders which gives processes. 
@@ -378,7 +378,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~
 INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ 
 INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ 
 INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. 
-65 processes with 1119 diagrams generated in 1.816 s
+65 processes with 1119 diagrams generated in 1.855 s
 Total: 83 processes with 1202 diagrams
 output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32
 Load PLUGIN.CUDACPP_OUTPUT
@@ -500,7 +500,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED
 INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 
 INFO: Creating files in directory P2_gg_ttxgg 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f5088d215e0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd323a3f700> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -521,7 +521,7 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxgg
 [1;32mDEBUG:  diag_to_iconfig = [0m {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [1;30m[model_handling.py at line 1545][0m [0m
 INFO: Creating files in directory P2_gg_ttxuux 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f50888303a0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd3240d87c0> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -542,7 +542,7 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxuux
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [1;30m[model_handling.py at line 1545][0m [0m
 INFO: Creating files in directory P2_gu_ttxgu 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f5088d215e0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd323ce7d00> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -563,7 +563,7 @@ INFO: Finding symmetric diagrams for subprocess group gu_ttxgu
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [1;30m[model_handling.py at line 1545][0m [0m
 INFO: Creating files in directory P2_gux_ttxgux 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f508870b940> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd3240d87c0> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -584,7 +584,7 @@ INFO: Finding symmetric diagrams for subprocess group gux_ttxgux
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [1;30m[model_handling.py at line 1545][0m [0m
 INFO: Creating files in directory P2_uux_ttxgg 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f5088c1c370> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd323e3e400> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -605,7 +605,7 @@ INFO: Finding symmetric diagrams for subprocess group uux_ttxgg
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [1;30m[model_handling.py at line 1545][0m [0m
 INFO: Creating files in directory P1_gg_ttxg 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f508888be50> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd323a3f700> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -626,7 +626,7 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxg
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [1;30m[model_handling.py at line 1545][0m [0m
 INFO: Creating files in directory P2_uu_ttxuu 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f5088c1c370> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd323a3f700> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -647,7 +647,7 @@ INFO: Finding symmetric diagrams for subprocess group uu_ttxuu
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [1;30m[model_handling.py at line 1545][0m [0m
 INFO: Creating files in directory P2_uux_ttxuux 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f5088c1cee0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd323d90850> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -668,7 +668,7 @@ INFO: Finding symmetric diagrams for subprocess group uux_ttxuux
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [1;30m[model_handling.py at line 1545][0m [0m
 INFO: Creating files in directory P2_uxux_ttxuxux 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f5088d8dcd0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd323d3cf70> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -689,7 +689,7 @@ INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [1;30m[model_handling.py at line 1545][0m [0m
 INFO: Creating files in directory P2_uc_ttxuc 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f5088d215e0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd3241e25b0> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -710,7 +710,7 @@ INFO: Finding symmetric diagrams for subprocess group uc_ttxuc
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [1;30m[model_handling.py at line 1545][0m [0m
 INFO: Creating files in directory P2_uux_ttxccx 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f508870b880> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd3241e25b0> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -731,7 +731,7 @@ INFO: Finding symmetric diagrams for subprocess group uux_ttxccx
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [1;30m[model_handling.py at line 1545][0m [0m
 INFO: Creating files in directory P2_ucx_ttxucx 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f5088c1c370> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd3241e25b0> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -752,7 +752,7 @@ INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [1;30m[model_handling.py at line 1545][0m [0m
 INFO: Creating files in directory P2_uxcx_ttxuxcx 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f5088c1cee0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd323d3ce20> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -773,7 +773,7 @@ INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [1;30m[model_handling.py at line 1545][0m [0m
 INFO: Creating files in directory P1_gu_ttxu 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f5088833fa0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd3241e25b0> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -794,7 +794,7 @@ INFO: Finding symmetric diagrams for subprocess group gu_ttxu
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [1;30m[model_handling.py at line 1545][0m [0m
 INFO: Creating files in directory P1_gux_ttxux 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f508870b940> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd323e3e490> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -815,7 +815,7 @@ INFO: Finding symmetric diagrams for subprocess group gux_ttxux
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [1;30m[model_handling.py at line 1545][0m [0m
 INFO: Creating files in directory P1_uux_ttxg 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f5088854880> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd323e3e490> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -836,7 +836,7 @@ INFO: Finding symmetric diagrams for subprocess group uux_ttxg
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [1;30m[model_handling.py at line 1545][0m [0m
 INFO: Creating files in directory P0_gg_ttx 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f50888301c0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd323ce4640> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -857,7 +857,7 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttx
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3} [1;30m[model_handling.py at line 1545][0m [0m
 INFO: Creating files in directory P0_uux_ttx 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f50888303a0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd323d4b880> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -876,22 +876,22 @@ INFO: Finding symmetric diagrams for subprocess group uux_ttx
 [1;32mDEBUG:  len(subproc_diagrams_for_config) = [0m 1 [1;30m[model_handling.py at line 1520][0m [0m
 [1;32mDEBUG:  iconfig_to_diag = [0m {1: 1} [1;30m[model_handling.py at line 1544][0m [0m
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1} [1;30m[model_handling.py at line 1545][0m [0m
-Generated helas calls for 18 subprocesses (372 diagrams) in 1.297 s
-Wrote files for 810 helas calls in 4.490 s
+Generated helas calls for 18 subprocesses (372 diagrams) in 1.293 s
+Wrote files for 810 helas calls in 3.534 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 5 routines in  0.341 s
+ALOHA: aloha creates 5 routines in  0.335 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 10 routines in  0.318 s
+ALOHA: aloha creates 10 routines in  0.315 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -1100,10 +1100,10 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m12.229s
-user	0m10.360s
-sys	0m0.958s
-Code generation completed in 13 seconds
+real	0m11.245s
+user	0m10.299s
+sys	0m0.899s
+Code generation completed in 12 seconds
 ************************************************************
 *                                                          *
 *                      W E L C O M E to                    *
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f
index 7bd8ec493e..c08c7c485d 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f
@@ -516,7 +516,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -532,7 +532,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -542,9 +542,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -558,22 +559,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/fcheck_sa.f
index 37d586be72..f0220047d7 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/fcheck_sa.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f
index b1f45c3af7..a912a12c0f 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f
@@ -143,7 +143,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -249,7 +248,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -294,8 +292,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f
index c4e476d6c0..868a3ef6c6 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f
@@ -571,7 +571,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -587,7 +587,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -597,9 +597,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -613,22 +614,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/fcheck_sa.f
index 37d586be72..f0220047d7 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/fcheck_sa.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f
index 8d74ac5b98..d30687b866 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f
@@ -146,7 +146,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -252,7 +251,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -297,8 +295,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f
index c9ca1538d3..1c3ba92e6d 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f
@@ -516,7 +516,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -532,7 +532,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -542,9 +542,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -558,22 +559,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f
index 9170a32a19..cb7efdfbcf 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f
index 3d035277eb..6fdf8a8d07 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f
@@ -159,7 +159,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -265,7 +264,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -310,8 +308,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f
index 2c11f53b89..3d7efb5585 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f
@@ -560,7 +560,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -576,7 +576,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -586,9 +586,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -602,22 +603,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f
index 9170a32a19..cb7efdfbcf 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f
index 0a318e1c05..259aaec8a1 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f
@@ -162,7 +162,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -268,7 +267,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -313,8 +311,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f
index d829a73049..d65bac7611 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f
@@ -560,7 +560,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -576,7 +576,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -586,9 +586,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -602,22 +603,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f
index 9170a32a19..cb7efdfbcf 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f
index f012b48d83..f85cd82256 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f
@@ -162,7 +162,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -268,7 +267,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -313,8 +311,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f
index 0eb22610bf..89f360f028 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f
@@ -571,7 +571,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -587,7 +587,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -597,9 +597,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -613,22 +614,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/fcheck_sa.f
index 9170a32a19..cb7efdfbcf 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/fcheck_sa.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f
index 3d2319b36a..2f6c72fb43 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f
@@ -162,7 +162,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -268,7 +267,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -313,8 +311,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f
index 6a17e242b2..85dd15d507 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f
@@ -516,7 +516,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -532,7 +532,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -542,9 +542,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -558,22 +559,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/fcheck_sa.f
index 32f6c3207c..6a66bac979 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/fcheck_sa.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f
index 926b17aa45..2d877b9bc0 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f
@@ -191,7 +191,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -297,7 +296,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -342,8 +340,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f
index a952958df8..0717127ecc 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f
@@ -549,7 +549,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -565,7 +565,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -575,9 +575,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -591,22 +592,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/fcheck_sa.f
index 32f6c3207c..6a66bac979 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/fcheck_sa.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f
index 520aaec0b1..74f9ed957c 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f
@@ -194,7 +194,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -300,7 +299,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -345,8 +343,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f
index a41c6f876a..78a109f493 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f
@@ -560,7 +560,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -576,7 +576,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -586,9 +586,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -602,22 +603,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/fcheck_sa.f
index 32f6c3207c..6a66bac979 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/fcheck_sa.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f
index f77bfa066c..07469eded9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f
@@ -194,7 +194,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -300,7 +299,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -345,8 +343,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f
index 700cdbece2..e40cd6c43f 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f
@@ -560,7 +560,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -576,7 +576,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -586,9 +586,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -602,22 +603,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/fcheck_sa.f
index 32f6c3207c..6a66bac979 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/fcheck_sa.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f
index 4c36b4bcce..a72674b621 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f
@@ -194,7 +194,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -300,7 +299,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -345,8 +343,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f
index bc898ac10e..7648cf57b1 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f
@@ -587,7 +587,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -603,7 +603,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -613,9 +613,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -629,22 +630,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/fcheck_sa.f
index 32f6c3207c..6a66bac979 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/fcheck_sa.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f
index eec298dc6c..1ea1b00778 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f
@@ -196,7 +196,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -302,7 +301,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -347,8 +345,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f
index 3db88ba2c3..deb87c2e1c 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f
@@ -659,7 +659,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -675,7 +675,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -685,9 +685,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -701,22 +702,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/fcheck_sa.f
index 32f6c3207c..6a66bac979 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/fcheck_sa.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f
index a530c382f1..62460f03a4 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f
@@ -202,7 +202,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -308,7 +307,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -353,8 +351,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f
index 8988ba6c1d..bd3cb3fcff 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f
@@ -571,7 +571,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -587,7 +587,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -597,9 +597,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -613,22 +614,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/fcheck_sa.f
index 32f6c3207c..6a66bac979 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/fcheck_sa.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f
index f6d8294bd3..e4c318e9f7 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f
@@ -194,7 +194,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -300,7 +299,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -345,8 +343,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f
index 37b6741d5b..ac61617b61 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f
@@ -659,7 +659,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -675,7 +675,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -685,9 +685,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -701,22 +702,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/fcheck_sa.f
index 32f6c3207c..6a66bac979 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/fcheck_sa.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f
index 4b974a1e79..b2be8a2661 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f
@@ -202,7 +202,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -308,7 +307,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -353,8 +351,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f
index 4f5f2bb65a..f0bf648d9b 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f
@@ -571,7 +571,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -587,7 +587,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -597,9 +597,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -613,22 +614,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/fcheck_sa.f
index 32f6c3207c..6a66bac979 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/fcheck_sa.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f
index 3c33819612..ab5c2f5dcc 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f
@@ -194,7 +194,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -300,7 +299,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -345,8 +343,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f
index 598e4f55b8..e7b63d08c4 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f
@@ -571,7 +571,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -587,7 +587,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -597,9 +597,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -613,22 +614,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/fcheck_sa.f
index 32f6c3207c..6a66bac979 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/fcheck_sa.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f
index 485ad633d3..db949d4977 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f
@@ -194,7 +194,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -300,7 +299,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -345,8 +343,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f
index dd3cd5c8a4..765f218d09 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f
@@ -587,7 +587,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -603,7 +603,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -613,9 +613,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -629,22 +630,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/fcheck_sa.f
index 32f6c3207c..6a66bac979 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/fcheck_sa.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f
index 16d80c44b6..f921e966b9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f
@@ -196,7 +196,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -302,7 +301,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -347,8 +345,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f
index ef5dde5b56..8284af5cac 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f
@@ -571,7 +571,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -587,7 +587,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -597,9 +597,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -613,22 +614,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/fcheck_sa.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/fcheck_sa.f
index 32f6c3207c..6a66bac979 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/fcheck_sa.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f
index 5510afb41e..c0df727705 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f
@@ -194,7 +194,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -300,7 +299,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -345,8 +343,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/counters.cc
index 742575a6a5..8ef58cce80 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/counters.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/counters.cc
@@ -21,26 +21,24 @@ extern "C"
 {
   // Now: fortran=-1, cudacpp=0
   // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int nimplC = 3;
   constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
   const char* iimplC2TXT( int iimplC )
   {
     const int iimplF = iimplC - 1;
     switch( iimplF )
     {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
+      case -1: return "Fortran MEs"; break;
+      case +0: return "CudaCpp MEs"; break;
+      case +1: return "CudaCpp HEL"; break;
       default: assert( false ); break;
     }
   }
 
   static mgOnGpu::Timer<TIMERTYPE> program_timer;
   static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
   static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
   static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
   static int smatrix1multi_counter[nimplC] = { 0 };
 
   void counters_initialise_()
@@ -49,19 +47,6 @@ extern "C"
     return;
   }
 
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
   void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
   {
     const unsigned int iimplC = iimplF2C( *iimplF );
@@ -86,13 +71,23 @@ extern "C"
     printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
     printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
     for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+    {
       if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+      {
+        if( iimplC < nimplC - 1 ) // MEs
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC],
+                  smatrix1multi_counter[iimplC],
+                  smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+        else
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC] );
+      }
+    }
     return;
   }
 }
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # $(CUDA_HOME)/include/ does not exist
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt
index fe284c1cc5..d55f30f145 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt
@@ -77,7 +77,7 @@ INFO: load vertices
 [1;32mDEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1) [0m
 [1;32mDEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3) [0m
 [1;32mDEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1) [0m
-[1;32mDEBUG: model prefixing  takes 0.13704657554626465 [0m
+[1;32mDEBUG: model prefixing  takes 0.13804030418395996 [0m
 INFO: Change particles name to pass to MG5 convention 
 Defined multiparticle p = g u c d s u~ c~ d~ s~
 Defined multiparticle j = g u c d s u~ c~ d~ s~
@@ -92,7 +92,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 
 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1  
 INFO: Process has 72 diagrams 
-1 processes with 72 diagrams generated in 3.722 s
+1 processes with 72 diagrams generated in 3.673 s
 Total: 1 processes with 72 diagrams
 output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False --vector_size=32
 Load PLUGIN.CUDACPP_OUTPUT
@@ -115,7 +115,7 @@ INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1
 INFO: Processing color information for process: g g > t t~ t t~ @1 
 INFO: Creating files in directory P1_gg_ttxttx 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7efe04f1dfa0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f22e51fafa0> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -134,22 +134,22 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttxttx
 [1;32mDEBUG:  len(subproc_diagrams_for_config) = [0m 70 [1;30m[model_handling.py at line 1520][0m [0m
 [1;32mDEBUG:  iconfig_to_diag = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [1;30m[model_handling.py at line 1544][0m [0m
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [1;30m[model_handling.py at line 1545][0m [0m
-Generated helas calls for 1 subprocesses (72 diagrams) in 0.187 s
-Wrote files for 119 helas calls in 0.437 s
+Generated helas calls for 1 subprocesses (72 diagrams) in 0.185 s
+Wrote files for 119 helas calls in 0.432 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV5 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV9 routines[0m
 ALOHA: aloha creates VVVV10 routines[0m
-ALOHA: aloha creates 5 routines in  0.318 s
+ALOHA: aloha creates 5 routines in  0.317 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV5 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV9 routines[0m
 ALOHA: aloha creates VVVV10 routines[0m
-ALOHA: aloha creates 10 routines in  0.329 s
+ALOHA: aloha creates 10 routines in  0.333 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV5
 <class 'aloha.create_aloha.AbstractRoutine'> VVV5
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -193,10 +193,10 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m7.228s
-user	0m6.896s
-sys	0m0.299s
-Code generation completed in 8 seconds
+real	0m7.220s
+user	0m6.848s
+sys	0m0.283s
+Code generation completed in 7 seconds
 ************************************************************
 *                                                          *
 *                      W E L C O M E to                    *
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f
index 86efacfe7f..461cfa8224 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f
@@ -516,7 +516,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -532,7 +532,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -542,9 +542,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -558,22 +559,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/fcheck_sa.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/fcheck_sa.f
index 32f6c3207c..6a66bac979 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/fcheck_sa.f
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f
index 4d5cb63761..d96ba556c5 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f
@@ -191,7 +191,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -297,7 +296,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -342,8 +340,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/counters.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/counters.cc
index 742575a6a5..8ef58cce80 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/counters.cc
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/counters.cc
@@ -21,26 +21,24 @@ extern "C"
 {
   // Now: fortran=-1, cudacpp=0
   // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int nimplC = 3;
   constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
   const char* iimplC2TXT( int iimplC )
   {
     const int iimplF = iimplC - 1;
     switch( iimplF )
     {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
+      case -1: return "Fortran MEs"; break;
+      case +0: return "CudaCpp MEs"; break;
+      case +1: return "CudaCpp HEL"; break;
       default: assert( false ); break;
     }
   }
 
   static mgOnGpu::Timer<TIMERTYPE> program_timer;
   static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
   static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
   static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
   static int smatrix1multi_counter[nimplC] = { 0 };
 
   void counters_initialise_()
@@ -49,19 +47,6 @@ extern "C"
     return;
   }
 
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
   void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
   {
     const unsigned int iimplC = iimplF2C( *iimplF );
@@ -86,13 +71,23 @@ extern "C"
     printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
     printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
     for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+    {
       if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+      {
+        if( iimplC < nimplC - 1 ) // MEs
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC],
+                  smatrix1multi_counter[iimplC],
+                  smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+        else
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC] );
+      }
+    }
     return;
   }
 }
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # CUDA_INC is empty: nvcc was not found or $(CUDA_HOME)/include/ does not exist
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # CUDA_INC is empty: nvcc was not found or $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt
index 62d7042d00..4fb7228286 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt
@@ -77,7 +77,7 @@ INFO: load vertices
 [1;32mDEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1) [0m
 [1;32mDEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3) [0m
 [1;32mDEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1) [0m
-[1;32mDEBUG: model prefixing  takes 0.13954997062683105 [0m
+[1;32mDEBUG: model prefixing  takes 0.13859224319458008 [0m
 INFO: Change particles name to pass to MG5 convention 
 Defined multiparticle p = g u c d s u~ c~ d~ s~
 Defined multiparticle j = g u c d s u~ c~ d~ s~
@@ -92,7 +92,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 
 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1  
 INFO: Process has 72 diagrams 
-1 processes with 72 diagrams generated in 3.727 s
+1 processes with 72 diagrams generated in 3.821 s
 Total: 1 processes with 72 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt
 Load PLUGIN.CUDACPP_OUTPUT
@@ -122,7 +122,7 @@ ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV9 routines[0m
 ALOHA: aloha creates VVVV10 routines[0m
-ALOHA: aloha creates 5 routines in  0.322 s
+ALOHA: aloha creates 5 routines in  0.316 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV5
 <class 'aloha.create_aloha.AbstractRoutine'> VVV5
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -142,7 +142,7 @@ INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SME
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. 
 quit
 
-real	0m5.137s
-user	0m5.030s
-sys	0m0.072s
+real	0m5.206s
+user	0m5.107s
+sys	0m0.076s
 Code generation completed in 5 seconds
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fcheck_sa.f b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fcheck_sa.f
index 32f6c3207c..6a66bac979 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fcheck_sa.f
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # CUDA_INC is empty: nvcc was not found or $(CUDA_HOME)/include/ does not exist
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt
index 37089500b4..49e61427c5 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt
@@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1  
 INFO: Process has 6 diagrams 
-1 processes with 6 diagrams generated in 0.125 s
+1 processes with 6 diagrams generated in 0.130 s
 Total: 1 processes with 6 diagrams
 output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32
 Load PLUGIN.CUDACPP_OUTPUT
@@ -577,7 +577,7 @@ INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1
 INFO: Processing color information for process: g g > t1 t1~ @1 
 INFO: Creating files in directory P1_gg_t1t1x 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fc4830d31c0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1bae241100> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -597,7 +597,7 @@ INFO: Finding symmetric diagrams for subprocess group gg_t1t1x
 [1;32mDEBUG:  iconfig_to_diag = [0m {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [1;30m[model_handling.py at line 1544][0m [0m
 [1;32mDEBUG:  diag_to_iconfig = [0m {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [1;30m[model_handling.py at line 1545][0m [0m
 Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s
-Wrote files for 16 helas calls in 0.125 s
+Wrote files for 16 helas calls in 0.126 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VSS1 routines[0m
@@ -607,7 +607,7 @@ ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VSS1 routines[0m
 ALOHA: aloha creates VVSS1 routines[0m
-ALOHA: aloha creates 6 routines in  0.180 s
+ALOHA: aloha creates 6 routines in  0.182 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VSS1
 <class 'aloha.create_aloha.AbstractRoutine'> VSS1
@@ -647,10 +647,10 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m3.044s
-user	0m2.726s
-sys	0m0.300s
-Code generation completed in 3 seconds
+real	0m3.278s
+user	0m2.733s
+sys	0m0.284s
+Code generation completed in 4 seconds
 ************************************************************
 *                                                          *
 *                      W E L C O M E to                    *
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f
index 69a8372b3e..0170f78a25 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f
@@ -516,7 +516,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -532,7 +532,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -542,9 +542,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -558,22 +559,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/fcheck_sa.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/fcheck_sa.f
index 37d586be72..f0220047d7 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/fcheck_sa.f
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f
index b1f74c86e4..bfb95cf2ee 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f
@@ -131,7 +131,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -237,7 +236,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -282,8 +280,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/counters.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/counters.cc
index 742575a6a5..8ef58cce80 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/counters.cc
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/counters.cc
@@ -21,26 +21,24 @@ extern "C"
 {
   // Now: fortran=-1, cudacpp=0
   // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int nimplC = 3;
   constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
   const char* iimplC2TXT( int iimplC )
   {
     const int iimplF = iimplC - 1;
     switch( iimplF )
     {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
+      case -1: return "Fortran MEs"; break;
+      case +0: return "CudaCpp MEs"; break;
+      case +1: return "CudaCpp HEL"; break;
       default: assert( false ); break;
     }
   }
 
   static mgOnGpu::Timer<TIMERTYPE> program_timer;
   static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
   static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
   static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
   static int smatrix1multi_counter[nimplC] = { 0 };
 
   void counters_initialise_()
@@ -49,19 +47,6 @@ extern "C"
     return;
   }
 
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
   void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
   {
     const unsigned int iimplC = iimplF2C( *iimplF );
@@ -86,13 +71,23 @@ extern "C"
     printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
     printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
     for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+    {
       if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+      {
+        if( iimplC < nimplC - 1 ) // MEs
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC],
+                  smatrix1multi_counter[iimplC],
+                  smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+        else
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC] );
+      }
+    }
     return;
   }
 }
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # $(CUDA_HOME)/include/ does not exist
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt
index c2f899fe3e..1085728e17 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt
@@ -582,7 +582,7 @@ ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VSS1 routines[0m
 ALOHA: aloha creates VVSS1 routines[0m
-ALOHA: aloha creates 3 routines in  0.185 s
+ALOHA: aloha creates 3 routines in  0.183 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VSS1
 <class 'aloha.create_aloha.AbstractRoutine'> VSS1
@@ -598,7 +598,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. 
 quit
 
-real	0m1.342s
-user	0m1.248s
-sys	0m0.080s
+real	0m1.401s
+user	0m1.286s
+sys	0m0.057s
 Code generation completed in 1 seconds
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/fcheck_sa.f b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/fcheck_sa.f
index 37d586be72..f0220047d7 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/fcheck_sa.f
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # $(CUDA_HOME)/include/ does not exist
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt
index 4f86b653e0..a1082c61f1 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt
+++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt
@@ -577,7 +577,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1
 INFO: Processing color information for process: g g > t t~ @1 
 INFO: Creating files in directory P1_gg_ttx 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1152][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f5ecd82c850> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f4cda57b040> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -597,15 +597,15 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttx
 [1;32mDEBUG:  iconfig_to_diag = [0m {1: 1, 2: 2, 3: 3} [1;30m[model_handling.py at line 1544][0m [0m
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3} [1;30m[model_handling.py at line 1545][0m [0m
 Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s
-Wrote files for 10 helas calls in 0.119 s
+Wrote files for 10 helas calls in 0.116 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates FFV1 routines[0m
-ALOHA: aloha creates 2 routines in  0.137 s
+ALOHA: aloha creates 2 routines in  0.139 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates FFV1 routines[0m
-ALOHA: aloha creates 4 routines in  0.136 s
+ALOHA: aloha creates 4 routines in  0.135 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -640,10 +640,10 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m3.851s
-user	0m2.590s
-sys	0m0.316s
-Code generation completed in 4 seconds
+real	0m2.872s
+user	0m2.564s
+sys	0m0.301s
+Code generation completed in 2 seconds
 ************************************************************
 *                                                          *
 *                      W E L C O M E to                    *
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
index f9e2335de4..d5accb9fb2 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
@@ -516,7 +516,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
       
       IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
 #endif
-        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortranMEs=-1
 !$OMP PARALLEL
 !$OMP DO
         DO IVEC=1, VECSIZE_USED
@@ -532,7 +532,7 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
         ENDDO
 !$OMP END DO
 !$OMP END PARALLEL
-        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
+        call counters_smatrix1multi_stop( -1 ) ! fortranMEs=-1
 #ifdef MG5AMC_MEEXPORTER_CUDACPP
       ENDIF
 
@@ -542,9 +542,10 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           STOP
         ENDIF
         IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
+          call counters_smatrix1multi_start( 1, VECSIZE_USED ) ! cudacppHEL=1
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .TRUE.) ! quit after computing helicities
           FIRST = .FALSE.
 c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
           IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
@@ -558,22 +559,23 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
           ENDIF
           WRITE (6,*) 'NGOODHEL =', NGOODHEL
           WRITE (6,*) 'NCOMB =', NCOMB
+          call counters_smatrix1multi_stop( 1 ) ! cudacppHEL=1
         ENDIF
-        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacppMEs=0
         IF ( .NOT. MULTI_CHANNEL ) THEN
           CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
      &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 )
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ELSE
           IF( SDE_STRAT.NE.1 ) THEN
             WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
             STOP
           ENDIF
-          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G, ! multi channel enabled
      &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
+     &      SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
         ENDIF
-        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
+        call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0
       ENDIF
 
       IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f
index 37d586be72..f0220047d7 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f
index bc79ed4217..aa332cd578 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f
@@ -143,7 +143,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 C     BEGIN CODE
 C     ----------
 
-      call counters_smatrix1_start()
       NTRY(IMIRROR)=NTRY(IMIRROR)+1
       THIS_NTRY(IMIRROR) = THIS_NTRY(IMIRROR)+1
       DO I=1,NEXTERNAL
@@ -249,7 +248,6 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         IHEL = HEL_PICKED
       ELSE
         ANS = 1D0
-        call counters_smatrix1_stop()
         RETURN
       ENDIF
       IF (ANS.NE.0D0.AND.(ISUM_HEL .NE. 1.OR.HEL_PICKED.EQ.-1)) THEN
@@ -294,8 +292,9 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
         ENDIF
       ENDIF
       ANS=ANS/DBLE(IDEN)
+
       CALL SELECT_COLOR(RCOL, JAMP2, ICONFIG,1,  ICOL)
-      call counters_smatrix1_stop()
+
       END
 
 
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/counters.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/counters.cc
index 742575a6a5..8ef58cce80 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/counters.cc
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/counters.cc
@@ -21,26 +21,24 @@ extern "C"
 {
   // Now: fortran=-1, cudacpp=0
   // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int nimplC = 3;
   constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
   const char* iimplC2TXT( int iimplC )
   {
     const int iimplF = iimplC - 1;
     switch( iimplF )
     {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
+      case -1: return "Fortran MEs"; break;
+      case +0: return "CudaCpp MEs"; break;
+      case +1: return "CudaCpp HEL"; break;
       default: assert( false ); break;
     }
   }
 
   static mgOnGpu::Timer<TIMERTYPE> program_timer;
   static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
   static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
   static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
   static int smatrix1multi_counter[nimplC] = { 0 };
 
   void counters_initialise_()
@@ -49,19 +47,6 @@ extern "C"
     return;
   }
 
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
   void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
   {
     const unsigned int iimplC = iimplF2C( *iimplF );
@@ -86,13 +71,23 @@ extern "C"
     printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
     printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
     for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+    {
       if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+      {
+        if( iimplC < nimplC - 1 ) // MEs
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC],
+                  smatrix1multi_counter[iimplC],
+                  smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+        else
+          printf( " [COUNTERS] %11s      ( %1d ) : %9.4fs\n",
+                  iimplC2TXT( iimplC ),
+                  iimplC + 1,
+                  smatrix1multi_totaltime[iimplC] );
+      }
+    }
     return;
   }
 }
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) if a CUDA installation with an include directory exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # $(CUDA_HOME)/include/ does not exist
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.inc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt
index 45e10ca3ac..8479028997 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt
+++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt
@@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ WEIGHTED<=2 @1  
 INFO: Process has 3 diagrams 
-1 processes with 3 diagrams generated in 0.122 s
+1 processes with 3 diagrams generated in 0.121 s
 Total: 1 processes with 3 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt
 Load PLUGIN.CUDACPP_OUTPUT
@@ -581,7 +581,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates FFV1 routines[0m
-ALOHA: aloha creates 2 routines in  0.137 s
+ALOHA: aloha creates 2 routines in  0.136 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -596,7 +596,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
 quit
 
-real	0m1.308s
-user	0m1.215s
-sys	0m0.063s
+real	0m1.278s
+user	0m1.188s
+sys	0m0.072s
 Code generation completed in 1 seconds
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h
index 4bf2198dd1..60eb101a6a 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h
@@ -109,9 +109,9 @@ namespace mg5amcCpu
      * @param rndcol the pointer to the input random numbers for color selection
      * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
      * @param mes the pointer to the output matrix elements
-     * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
+     * @param goodHelOnly quit after computing good helicities?
      */
     void gpu_sequence( const FORTRANFPTYPE* momenta,
                        const FORTRANFPTYPE* gs,
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f
index 37d586be72..f0220047d7 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f
@@ -63,7 +63,7 @@ PROGRAM FCHECK_SA
           GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc)
         END DO
         CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466
-     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &    RNDHEL, RNDCOL, MES, SELHEL, SELCOL, .FALSE.) ! do not quit after computing helicities
         DO IEVT = 1, NEVT
 c         DO IEXTERNAL = 1, NEXTERNAL
 c           WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL,
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk
index 359f16c029..9cff5e1a60 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk
@@ -116,15 +116,31 @@ override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
 override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
 
-# Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists
-# (FIXME? Is there any equivalent of NVTX FOR HIP? What should be configured if both CUDA and HIP are installed?)
-ifneq ($(CUDA_HOME),)
-  USE_NVTX ?=-DUSE_NVTX
-  CUDA_INC = -I$(CUDA_HOME)/include/
+# Configure CUDA_INC (for CURAND and NVTX) if a CUDA installation with an include directory exists (see #965)
+ifeq ($(CUDA_HOME),)
+  # CUDA_HOME is empty (nvcc not found)
+  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/),)
+  # CUDA_HOME is defined (nvcc was found) but $(CUDA_HOME)/include/ does not exist?
+  override CUDA_INC=
 else
+  CUDA_INC = -I$(CUDA_HOME)/include/
+endif
+###$(info CUDA_INC=$(CUDA_INC))
+
+# Configure NVTX if a CUDA include directory exists and NVTX headers exist (see #965)
+ifeq ($(CUDA_INC),)
+  # $(CUDA_HOME)/include/ does not exist
   override USE_NVTX=
-  override CUDA_INC=
+else ifeq ($(wildcard $(CUDA_HOME)/include/nvtx3/nvToolsExt.h),)
+  # $(CUDA_HOME)/include/ exists but NVTX headers do not exist?
+  override USE_NVTX=
+else
+  # $(CUDA_HOME)/include/nvtx3/nvToolsExt.h exists: use NVTX
+  # (NB: the option to disable NVTX if 'USE_NVTX=' is defined has been removed)
+  override USE_NVTX=-DUSE_NVTX
 endif
+###$(info USE_NVTX=$(USE_NVTX))
 
 # NB: NEW LOGIC FOR ENABLING AND DISABLING CUDA OR HIP BUILDS (AV Feb-Mar 2024)
 # - In the old implementation, by default the C++ targets for one specific AVX were always built together with either CUDA or HIP.
@@ -424,13 +440,18 @@ endif
 # (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
 ifeq ($(HASCURAND),)
   ifeq ($(GPUCC),) # CPU-only build
-    ifneq ($(CUDA_HOME),)
+    ifeq ($(CUDA_INC),)
+      # CUDA_INC is empty: CUDA_HOME is empty or $(CUDA_HOME)/include/ does not exist (see #965)
+      override HASCURAND = hasNoCurand
+    else ifeq ($(wildcard $(CUDA_HOME)/include/curand.h),)
+      # $(CUDA_HOME)/include/ exists but CURAND headers do not exist? (see #965)
+      override HASCURAND = hasNoCurand
+    else
       # By default, assume that curand is installed if a CUDA installation exists
       override HASCURAND = hasCurand
-    else
-      override HASCURAND = hasNoCurand
     endif
   else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    # By default, assume that curand is installed if a CUDA build is requested
     override HASCURAND = hasCurand
   else # non-Nvidia GPU build
     override HASCURAND = hasNoCurand
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc
index 8a5b8be9c0..99efcb1dbe 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc
@@ -83,6 +83,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_( CppObjectInFortran** ppbridge,
                          const FORTRANFPTYPE* momenta,
@@ -92,18 +93,20 @@ extern "C"
                          const unsigned int* pchannelId,
                          FORTRANFPTYPE* mes,
                          int* selhel,
-                         int* selcol )
+                         int* selcol,
+                         const bool* pgoodHelOnly )
   {
     Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    //printf("fbridgesequence_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
     if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
 #ifdef MGONGPUCPP_GPUIMPL
     // Use the device/GPU implementation in the CUDA library
     // (there is also a host implementation in this library)
-    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #else
     // Use the host/CPU implementation in the C++ library
     // (there is no device implementation in this library)
-    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol );
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol, *pgoodHelOnly );
 #endif
   }
 
@@ -119,6 +122,7 @@ extern "C"
    * @param mes the pointer to the output matrix elements
    * @param selhel the pointer to the output selected helicities
    * @param selcol the pointer to the output selected colors
+   * @param goodHelOnly quit after computing good helicities?
    */
   void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge,
                                         const FORTRANFPTYPE* momenta,
@@ -127,9 +131,11 @@ extern "C"
                                         const FORTRANFPTYPE* rndcol,
                                         FORTRANFPTYPE* mes,
                                         int* selhel,
-                                        int* selcol )
+                                        int* selcol,
+                                        const bool* pgoodHelOnly )
   {
-    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol );
+    //printf("fbridgesequence_nomultichannel_ goodHelOnly=%d\n", ( *pgoodHelOnly ? 1 : 0 ) );
+    fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol, pgoodHelOnly );
   }
 
   /**
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc
index 422aa67cf9..a28622cdb6 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc
@@ -40,10 +40,11 @@ C - CHANID:  the input Feynman diagram to enhance in multi-channel mode if 1 to
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -53,6 +54,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE
       END INTERFACE
 
@@ -66,10 +68,11 @@ C - RNDCOL:  the input random number Fortran array for color selection
 C - MES:     the output matrix element Fortran array
 C - SELHEL:  the output selected helicity Fortran array
 C - SELCOL:  the output selected color Fortran array
+C - HELONLY: input flag, quit after computing good helicities?
 C
       INTERFACE
          SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS,
-     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL)
+     &     RNDHEL, RNDCOL, MES, SELHEL, SELCOL, HELONLY)
          INTEGER*8 PBRIDGE
          DOUBLE PRECISION MOMENTA(*)
          DOUBLE PRECISION GS(*)
@@ -78,6 +81,7 @@ C
          DOUBLE PRECISION MES(*)
          INTEGER*4 SELHEL(*)
          INTEGER*4 SELCOL(*)
+         LOGICAL HELONLY
          END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
       END INTERFACE
 
diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt
index b8847b7cb6..01107f564b 100644
--- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt
@@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
 make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-28_21:15:24
+DATE: 2024-08-08_20:42:55
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 3798 events (found 8192 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7267s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7180s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0087s for     8192 events => throughput is 9.36E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.6950s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.6868s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0082s for     8192 events => throughput is 1.00E+06 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.1784s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.1702s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0082s for     8192 events => throughput is 1.00E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.1770s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.1693s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0077s for     8192 events => throughput is 1.07E+06 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3705s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.2837s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0868s for    90112 events => throughput is 1.04E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3730s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2895s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0835s for    90112 events => throughput is 1.08E+06 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382715404661545E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.1849s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.1783s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0066s for     8192 events => throughput is 1.24E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.1777s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.1702s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0072s for     8192 events => throughput is 1.14E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3665s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.2911s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0754s for    90112 events => throughput is 1.19E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3648s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2879s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0766s for    90112 events => throughput is 1.18E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.152979e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.167196e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.181726e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.165900e+06                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.1799s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.1755s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0044s for     8192 events => throughput is 1.87E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.1752s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.1704s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0045s for     8192 events => throughput is 1.83E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3337s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.2882s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0455s for    90112 events => throughput is 1.98E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3353s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2887s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0463s for    90112 events => throughput is 1.94E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.930086e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.918558e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.984749e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.023579e+06                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.1789s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.1758s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0031s for     8192 events => throughput is 2.61E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.1786s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.1750s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0033s for     8192 events => throughput is 2.48E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3219s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.2881s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0338s for    90112 events => throughput is 2.67E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3295s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2928s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0363s for    90112 events => throughput is 2.48E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.539184e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.640473e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.709927e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.831088e+06                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.1781s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.1751s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0030s for     8192 events => throughput is 2.77E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.1752s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.1718s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0031s for     8192 events => throughput is 2.65E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3264s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.2932s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0332s for    90112 events => throughput is 2.71E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3209s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2867s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0338s for    90112 events => throughput is 2.66E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.635946e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.678759e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.766552e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.813366e+06                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.1769s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.1731s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0038s for     8192 events => throughput is 2.16E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.1736s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.1692s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0040s for     8192 events => throughput is 2.04E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3421s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3007s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0414s for    90112 events => throughput is 2.18E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3322s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2913s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0405s for    90112 events => throughput is 2.22E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.124836e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.108602e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.174846e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.253882e+06                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.6131s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.6126s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0005s for     8192 events => throughput is 1.49E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.6096s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.6084s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0006s for     8192 events => throughput is 1.32E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7401s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7351s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0050s for    90112 events => throughput is 1.79E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.7166s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7111s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0049s for    90112 events => throughput is 1.84E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.749639e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.377977e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.937376e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.939853e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.647566e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.088090e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.462558e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.478718e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.641748e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.243737e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.002002e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.989285e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.659137e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.238682e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.136180e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.131222e+08                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
index 587bb76d73..617aae1ec8 100644
--- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
@@ -1,17 +1,17 @@
 Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 
 
-make USEBUILDDIR=1 BACKEND=cuda
 
+make USEBUILDDIR=1 BACKEND=cuda
 make USEBUILDDIR=1 BACKEND=cppnone
-
 make USEBUILDDIR=1 BACKEND=cppsse4
-make USEBUILDDIR=1 BACKEND=cppavx2
 
-make USEBUILDDIR=1 BACKEND=cpp512y
+make USEBUILDDIR=1 BACKEND=cppavx2
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
 make USEBUILDDIR=1 BACKEND=cpp512z
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-28_21:15:40
+DATE: 2024-08-08_20:43:11
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 3798 events (found 8192 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7250s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7166s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0084s for     8192 events => throughput is 9.79E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.7259s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7175s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0084s for     8192 events => throughput is 9.72E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.1873s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.1787s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0086s for     8192 events => throughput is 9.52E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.1878s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.1797s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0081s for     8192 events => throughput is 1.01E+06 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3976s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3042s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0933s for    90112 events => throughput is 9.65E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3875s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3018s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0857s for    90112 events => throughput is 1.05E+06 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382703205998396E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.1981s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.1911s
+ [COUNTERS] PROGRAM TOTAL          :    0.1866s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.1794s
  [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0070s for     8192 events => throughput is 1.17E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515590123565249E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3643s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.2913s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0730s for    90112 events => throughput is 1.24E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3784s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3020s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0761s for    90112 events => throughput is 1.18E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.200646e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.232262e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.237867e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.234403e+06                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382700723828302E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.1729s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.1703s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0025s for     8192 events => throughput is 3.23E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.1808s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.1776s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0028s for     8192 events => throughput is 2.88E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515587612890761E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3200s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.2910s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0290s for    90112 events => throughput is 3.10E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3276s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2977s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0297s for    90112 events => throughput is 3.03E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0002s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.187376e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.119755e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.220665e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.282267e+06                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.1773s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.1751s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0023s for     8192 events => throughput is 3.62E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.1827s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.1799s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0025s for     8192 events => throughput is 3.27E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0002s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3126s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.2872s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0254s for    90112 events => throughput is 3.54E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3317s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3038s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0276s for    90112 events => throughput is 3.26E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0002s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.488004e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.481016e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.612737e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.570800e+06                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.1769s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.1747s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0022s for     8192 events => throughput is 3.74E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.1855s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.1828s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0024s for     8192 events => throughput is 3.35E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0002s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3121s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.2873s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0247s for    90112 events => throughput is 3.65E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3314s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3041s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0270s for    90112 events => throughput is 3.33E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.702442e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.644439e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.804759e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.697078e+06                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382704335459282E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.1768s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.1743s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0024s for     8192 events => throughput is 3.39E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.1845s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.1814s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0027s for     8192 events => throughput is 3.04E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515591296252558E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3213s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.2952s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0261s for    90112 events => throughput is 3.45E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3372s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3079s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0290s for    90112 events => throughput is 3.10E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.323941e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.387501e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.761942e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.616268e+06                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382706077425631E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.5917s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.5912s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0005s for     8192 events => throughput is 1.66E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.6084s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.6073s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0006s for     8192 events => throughput is 1.48E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515592892887687E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7121s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7074s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0047s for    90112 events => throughput is 1.92E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.7292s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7238s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0048s for    90112 events => throughput is 1.86E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.122217e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.601368e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.609665e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.718163e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.582586e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.633474e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.902910e+09                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.898384e+09                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.573732e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.829286e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.085324e+09                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.104797e+09                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.048658e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.012752e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.737473e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.802072e+08                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
index f580fe1044..e51bbf394d 100644
--- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
@@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
 make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-28_21:15:57
+DATE: 2024-08-08_20:43:26
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 3798 events (found 8192 events)
- [COUNTERS] PROGRAM TOTAL          :    0.6825s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.6742s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0083s for     8192 events => throughput is 9.83E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.6983s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.6906s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0077s for     8192 events => throughput is 1.06E+06 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,8 +83,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.1788s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.1708s
+ [COUNTERS] PROGRAM TOTAL          :    0.1791s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.1711s
  [COUNTERS] Fortran MEs      ( 1 ) :    0.0080s for     8192 events => throughput is 1.02E+06 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3707s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.2832s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0875s for    90112 events => throughput is 1.03E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3694s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2869s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0825s for    90112 events => throughput is 1.09E+06 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382715420701395E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.1866s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.1792s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0073s for     8192 events => throughput is 1.12E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.1846s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.1767s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0075s for     8192 events => throughput is 1.09E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3744s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.2956s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0787s for    90112 events => throughput is 1.14E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3660s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2865s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0792s for    90112 events => throughput is 1.14E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.113565e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.124575e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.142626e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.154252e+06                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382715420701354E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.1769s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.1728s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0040s for     8192 events => throughput is 2.04E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.1757s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.1709s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0044s for     8192 events => throughput is 1.88E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3372s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.2926s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0446s for    90112 events => throughput is 2.02E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3336s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2878s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0455s for    90112 events => throughput is 1.98E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.964919e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.982594e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.051752e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.052848e+06                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.1805s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.1771s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0033s for     8192 events => throughput is 2.46E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.1749s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.1711s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0035s for     8192 events => throughput is 2.31E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3444s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3072s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0371s for    90112 events => throughput is 2.43E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3282s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2920s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0358s for    90112 events => throughput is 2.51E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.534941e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.552156e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.720296e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.649390e+06                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.1857s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.1827s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0030s for     8192 events => throughput is 2.69E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.1744s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.1708s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0033s for     8192 events => throughput is 2.47E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3391s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3032s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0359s for    90112 events => throughput is 2.51E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3217s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2876s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0337s for    90112 events => throughput is 2.67E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.692433e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.650509e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.750694e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.719714e+06                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.1831s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.1796s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0035s for     8192 events => throughput is 2.31E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.1750s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.1712s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0035s for     8192 events => throughput is 2.36E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3483s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3066s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0416s for    90112 events => throughput is 2.16E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3264s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2866s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0395s for    90112 events => throughput is 2.28E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.156388e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.207219e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.257287e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.300574e+06                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09338 [9.3382715392009194E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL          :    0.5936s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.5931s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0005s for     8192 events => throughput is 1.57E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.5992s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5980s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0006s for     8192 events => throughput is 1.38E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.09152 [9.1515602021089631E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7192s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7142s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0050s for    90112 events => throughput is 1.81E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.7158s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7101s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0050s for    90112 events => throughput is 1.80E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.699826e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.054665e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.973881e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.970842e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.642059e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.242307e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.500425e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.491734e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.034916e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.221256e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.050339e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.104459e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.807602e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.208981e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.156854e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.160987e+08                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt
index ccecc02825..8d24f348d7 100644
--- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt
@@ -3,16 +3,16 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g
 make USEBUILDDIR=1 BACKEND=cuda
 
 
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 BACKEND=cppnone
-make USEBUILDDIR=1 BACKEND=cppsse4
 
+make USEBUILDDIR=1 BACKEND=cppsse4
 make USEBUILDDIR=1 BACKEND=cppavx2
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-
-make USEBUILDDIR=1 BACKEND=cpp512y
 
 make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-28_21:16:14
+DATE: 2024-08-08_20:43:42
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0
  [UNWEIGHT] Wrote 2601 events (found 5405 events)
- [COUNTERS] PROGRAM TOTAL          :    0.8426s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7989s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0437s for     8192 events => throughput is 1.87E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8083s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7667s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0416s for     8192 events => throughput is 1.97E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4393s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3952s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0441s for     8192 events => throughput is 1.86E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4194s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3777s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0417s for     8192 events => throughput is 1.97E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.8386s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.3588s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.4798s for    90112 events => throughput is 1.88E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7491s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2980s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.4511s for    90112 events => throughput is 2.00E+05 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.5057s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4597s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0460s for     8192 events => throughput is 1.78E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4196s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3765s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0426s for     8192 events => throughput is 1.92E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105695279989099] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.9929s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.4771s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.5158s for    90112 events => throughput is 1.75E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7813s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2997s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4811s for    90112 events => throughput is 1.87E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.819871e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.879822e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.838165e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.903748e+05                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4393s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4133s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0260s for     8192 events => throughput is 3.15E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4107s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3853s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0250s for     8192 events => throughput is 3.28E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105695279989106] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.6401s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.3507s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2894s for    90112 events => throughput is 3.11E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.5717s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3004s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2709s for    90112 events => throughput is 3.33E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.138027e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.310019e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.282175e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.203674e+05                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4390s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4228s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0162s for     8192 events => throughput is 5.07E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3916s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3758s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0153s for     8192 events => throughput is 5.35E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105695279989135] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.5071s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.3263s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1808s for    90112 events => throughput is 4.98E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4759s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3059s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1696s for    90112 events => throughput is 5.31E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.258519e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.223657e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.342811e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.200982e+05                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4034s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3888s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0146s for     8192 events => throughput is 5.60E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3953s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3808s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0141s for     8192 events => throughput is 5.80E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105695279989135] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.4583s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2960s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1623s for    90112 events => throughput is 5.55E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4542s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3022s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1516s for    90112 events => throughput is 5.94E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.840330e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.865744e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.907730e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.035557e+05                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3956s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3739s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0217s for     8192 events => throughput is 3.78E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4098s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3854s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0239s for     8192 events => throughput is 3.43E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105695279989135] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.4701s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2380s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2321s for    90112 events => throughput is 3.88E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.5428s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3039s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2384s for    90112 events => throughput is 3.78E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.585624e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.669812e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.722456e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.898434e+05                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.8009s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.8003s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0006s for     8192 events => throughput is 1.42E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8047s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.8033s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0006s for     8192 events => throughput is 1.27E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.7129s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.7060s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0069s for    90112 events => throughput is 1.31E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7304s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7231s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0066s for    90112 events => throughput is 1.37E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.914318e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.008892e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.613535e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.654647e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.870641e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.331472e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.085338e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.082448e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.804787e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.310542e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.159663e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.160861e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.702330e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.331806e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.067580e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.063253e+07                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
index f46b75eef7..420861126b 100644
--- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
@@ -1,11 +1,11 @@
 Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 
-
 make USEBUILDDIR=1 BACKEND=cuda
 
-make USEBUILDDIR=1 BACKEND=cppnone
 
+make USEBUILDDIR=1 BACKEND=cppnone
 make USEBUILDDIR=1 BACKEND=cppsse4
+
 make USEBUILDDIR=1 BACKEND=cppavx2
 
 make USEBUILDDIR=1 BACKEND=cpp512y
@@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
 make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-28_21:16:41
+DATE: 2024-08-08_20:44:09
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0
  [UNWEIGHT] Wrote 2601 events (found 5405 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7783s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7371s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0411s for     8192 events => throughput is 1.99E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8019s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7604s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0415s for     8192 events => throughput is 1.97E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3978s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3570s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0407s for     8192 events => throughput is 2.01E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4215s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3800s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0416s for     8192 events => throughput is 1.97E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.6995s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2512s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.4483s for    90112 events => throughput is 2.01E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7567s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3058s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.4510s for    90112 events => throughput is 2.00E+05 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094179692708323] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4346s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3943s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0403s for     8192 events => throughput is 2.03E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4203s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3790s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0410s for     8192 events => throughput is 2.00E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105688388783328] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.6901s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2471s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4430s for    90112 events => throughput is 2.03E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7678s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3093s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4582s for    90112 events => throughput is 1.97E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.030986e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.984608e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.985412e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.996032e+05                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094175707109216] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3873s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3704s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0168s for     8192 events => throughput is 4.87E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3923s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3751s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0170s for     8192 events => throughput is 4.83E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105684583433771] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.4062s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2257s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1804s for    90112 events => throughput is 4.99E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4893s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3053s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1837s for    90112 events => throughput is 4.90E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.724911e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.831484e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.762208e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.765454e+05                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094173726920275] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3731s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3646s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0085s for     8192 events => throughput is 9.69E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3873s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3779s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0091s for     8192 events => throughput is 8.99E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105684037363524] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3134s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2174s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0960s for    90112 events => throughput is 9.38E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4091s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3116s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0972s for    90112 events => throughput is 9.27E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.106025e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.995090e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.235160e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.148417e+05                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094173726920275] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3725s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3646s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0079s for     8192 events => throughput is 1.04E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3894s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3807s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0085s for     8192 events => throughput is 9.68E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105684037363524] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3066s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2161s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0906s for    90112 events => throughput is 9.95E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.3961s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3040s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0917s for    90112 events => throughput is 9.82E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.917882e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.994646e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.709952e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.882184e+05                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094178448427996] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3747s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3637s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0110s for     8192 events => throughput is 7.46E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3945s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3828s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0112s for     8192 events => throughput is 7.31E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105688391432061] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3429s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2200s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1229s for    90112 events => throughput is 7.33E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.5017s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3657s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1356s for    90112 events => throughput is 6.64E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.804453e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.837763e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.929498e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.925566e+05                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094184162782994] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7869s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7863s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0005s for     8192 events => throughput is 1.57E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8112s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.8099s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0006s for     8192 events => throughput is 1.43E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0007s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105694501043516] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.6604s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6548s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0056s for    90112 events => throughput is 1.61E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7829s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7765s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0057s for    90112 events => throughput is 1.58E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0007s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.908595e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.085941e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.266019e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.178660e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.049207e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.983696e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.416476e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.406286e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.062460e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.010543e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.551978e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.536473e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.413712e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.527299e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.519394e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.475317e+07                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
index dc1bcf4827..65f004f30e 100644
--- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
@@ -3,20 +3,20 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g
 make USEBUILDDIR=1 BACKEND=cuda
 
 
+
 make USEBUILDDIR=1 BACKEND=cppnone
 make USEBUILDDIR=1 BACKEND=cppsse4
-
 make USEBUILDDIR=1 BACKEND=cppavx2
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-
-make USEBUILDDIR=1 BACKEND=cpp512y
 
 make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-28_21:17:06
+DATE: 2024-08-08_20:44:34
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0
  [UNWEIGHT] Wrote 2601 events (found 5405 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7779s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7372s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0407s for     8192 events => throughput is 2.01E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8115s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7704s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0411s for     8192 events => throughput is 1.99E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3947s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3534s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0413s for     8192 events => throughput is 1.99E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4214s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3805s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0408s for     8192 events => throughput is 2.01E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.7153s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2615s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.4538s for    90112 events => throughput is 1.99E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7670s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3128s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.4542s for    90112 events => throughput is 1.98E+05 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4374s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3938s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0436s for     8192 events => throughput is 1.88E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4222s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3775s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0442s for     8192 events => throughput is 1.85E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105696630006634] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.7264s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2487s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4777s for    90112 events => throughput is 1.89E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7889s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3008s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4877s for    90112 events => throughput is 1.85E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.883629e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.863098e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.895894e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.876650e+05                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4011s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3774s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0237s for     8192 events => throughput is 3.45E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4042s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3795s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0242s for     8192 events => throughput is 3.38E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105696630006626] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.4880s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2278s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2602s for    90112 events => throughput is 3.46E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.5750s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3065s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2680s for    90112 events => throughput is 3.36E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.317423e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.334875e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.373113e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.372227e+05                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3877s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3732s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0145s for     8192 events => throughput is 5.66E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3946s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3794s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0147s for     8192 events => throughput is 5.56E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3817s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2190s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1627s for    90112 events => throughput is 5.54E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4696s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3034s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1658s for    90112 events => throughput is 5.44E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.391740e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.223051e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.381923e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.767945e+05                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3811s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3680s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0131s for     8192 events => throughput is 6.26E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4019s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3874s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0142s for     8192 events => throughput is 5.78E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3634s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2153s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1482s for    90112 events => throughput is 6.08E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4595s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3077s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1514s for    90112 events => throughput is 5.95E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.907882e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.889622e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.901678e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.919078e+05                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3949s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3745s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0204s for     8192 events => throughput is 4.01E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4002s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3783s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0214s for     8192 events => throughput is 3.82E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.4596s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2305s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2291s for    90112 events => throughput is 3.93E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.5451s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3093s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2354s for    90112 events => throughput is 3.83E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.830144e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.737875e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.799392e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.863403e+05                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094184798437830] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7822s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7816s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0006s for     8192 events => throughput is 1.47E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8029s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.8014s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0006s for     8192 events => throughput is 1.26E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105695279068492] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL          :    1.6477s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6414s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0063s for    90112 events => throughput is 1.42E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7390s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7315s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0066s for    90112 events => throughput is 1.36E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.955290e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.004360e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.610728e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.618155e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.721513e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.337805e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.057949e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.064726e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.718962e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.321717e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.127927e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.141622e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.720504e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.487761e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.981768e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.948699e+07                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
index daad34ef63..c52a8af2f9 100644
--- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
@@ -3,9 +3,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g
 make USEBUILDDIR=1 BACKEND=cuda
 
 
-
 make USEBUILDDIR=1 BACKEND=cppnone
 make USEBUILDDIR=1 BACKEND=cppsse4
+
 make USEBUILDDIR=1 BACKEND=cppavx2
 
 make USEBUILDDIR=1 BACKEND=cpp512y
@@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-28_21:17:32
+DATE: 2024-08-08_20:45:01
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0
  [UNWEIGHT] Wrote 365 events (found 1496 events)
- [COUNTERS] PROGRAM TOTAL          :    0.6619s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3441s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.3178s for     8192 events => throughput is 2.58E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.6887s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3666s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.3221s for     8192 events => throughput is 2.54E+04 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.6512s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3209s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.3303s for     8192 events => throughput is 2.48E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.6558s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3350s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.3208s for     8192 events => throughput is 2.55E+04 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    5.0566s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.4770s
- [COUNTERS] Fortran MEs      ( 1 ) :    3.5796s for    90112 events => throughput is 2.52E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    5.1103s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5412s
+ [COUNTERS] Fortran MEs      ( 1 ) :    3.5692s for    90112 events => throughput is 2.52E+04 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.9732s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.6412s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.3320s for     8192 events => throughput is 2.47E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.6762s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3380s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.3370s for     8192 events => throughput is 2.43E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0012s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    5.4802s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.8183s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    3.6619s for    90112 events => throughput is 2.46E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    5.2687s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5495s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    3.7180s for    90112 events => throughput is 2.42E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0012s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.542070e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.517328e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.548855e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.477316e+04                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112748607748863] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.6596s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4845s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1750s for     8192 events => throughput is 4.68E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.5207s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3399s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1801s for     8192 events => throughput is 4.55E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    3.5748s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6476s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    1.9272s for    90112 events => throughput is 4.68E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    3.4936s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5370s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    1.9559s for    90112 events => throughput is 4.61E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.742087e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.723167e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.785084e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.710741e+04                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4868s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4003s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0864s for     8192 events => throughput is 9.48E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4289s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3383s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0899s for     8192 events => throughput is 9.11E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0007s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    2.5034s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.5585s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.9448s for    90112 events => throughput is 9.54E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.5415s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5644s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.9765s for    90112 events => throughput is 9.23E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.300762e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.063994e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.558512e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.113779e+04                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4736s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3967s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0769s for     8192 events => throughput is 1.07E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4521s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3684s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0831s for     8192 events => throughput is 9.86E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0007s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    2.4089s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.5590s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.8498s for    90112 events => throughput is 1.06E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.4440s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5615s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.8818s for    90112 events => throughput is 1.02E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0007s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.073418e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.056563e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.076553e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.066565e+05                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.5274s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4198s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1076s for     8192 events => throughput is 7.62E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4498s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3385s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1106s for     8192 events => throughput is 7.41E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    2.8167s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6114s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    1.2053s for    90112 events => throughput is 7.48E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.7606s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5479s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    1.2119s for    90112 events => throughput is 7.44E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.600195e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.524660e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.737653e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.502357e+04                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7530s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7476s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0054s for     8192 events => throughput is 1.52E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8444s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.8355s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0059s for     8192 events => throughput is 1.38E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0029s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238481932717736E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    1.9540s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.9311s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0229s for    90112 events => throughput is 3.93E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.9827s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.9565s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0233s for    90112 events => throughput is 3.86E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0028s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.632787e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.637288e+06                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.212630e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.243124e+06                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.415553e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.002014e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.240897e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.239487e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.415697e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.002136e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.251024e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.250655e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.408414e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.001900e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.766536e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.746731e+06                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
index 51c84bcce7..b25cff31e4 100644
--- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
@@ -2,12 +2,12 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g
 
 make USEBUILDDIR=1 BACKEND=cuda
 
-
 make USEBUILDDIR=1 BACKEND=cppnone
-make USEBUILDDIR=1 BACKEND=cppsse4
 
 
+make USEBUILDDIR=1 BACKEND=cppsse4
 make USEBUILDDIR=1 BACKEND=cppavx2
+
 make USEBUILDDIR=1 BACKEND=cpp512y
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-28_21:18:15
+DATE: 2024-08-08_20:45:43
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0
  [UNWEIGHT] Wrote 365 events (found 1496 events)
- [COUNTERS] PROGRAM TOTAL          :    0.6622s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3419s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.3203s for     8192 events => throughput is 2.56E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.6879s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3658s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.3221s for     8192 events => throughput is 2.54E+04 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.6540s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3233s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.3307s for     8192 events => throughput is 2.48E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.6575s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3322s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.3252s for     8192 events => throughput is 2.52E+04 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    5.0535s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.4792s
- [COUNTERS] Fortran MEs      ( 1 ) :    3.5744s for    90112 events => throughput is 2.52E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    5.0903s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5245s
+ [COUNTERS] Fortran MEs      ( 1 ) :    3.5658s for    90112 events => throughput is 2.53E+04 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112722616246457] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.9529s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.6280s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.3249s for     8192 events => throughput is 2.52E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.6630s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3346s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.3273s for     8192 events => throughput is 2.50E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0010s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238468293717765E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    5.7771s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.9289s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    3.8482s for    90112 events => throughput is 2.34E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    5.1318s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5454s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    3.5854s for    90112 events => throughput is 2.51E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0010s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.295265e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.562809e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.274937e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.549301e+04                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112720694019242] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.5768s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4654s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1114s for     8192 events => throughput is 7.35E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4414s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3412s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0997s for     8192 events => throughput is 8.22E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238454783817719E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    2.9013s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.7217s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    1.1796s for    90112 events => throughput is 7.64E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.6571s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5548s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    1.1018s for    90112 events => throughput is 8.18E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.824875e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.333170e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.970843e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.397937e+04                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112721757974454] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4346s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3870s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0476s for     8192 events => throughput is 1.72E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3825s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3366s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0455s for     8192 events => throughput is 1.80E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238453732924513E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    2.2132s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6840s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.5293s for    90112 events => throughput is 1.70E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.0649s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5567s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.5077s for    90112 events => throughput is 1.77E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.559003e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.821951e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.455805e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.834362e+05                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112721757974454] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4492s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4015s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0477s for     8192 events => throughput is 1.72E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3803s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3381s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0418s for     8192 events => throughput is 1.96E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238453732924513E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    2.1232s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6431s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4802s for    90112 events => throughput is 1.88E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.0303s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5712s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4587s for    90112 events => throughput is 1.96E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.558817e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.018262e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.780926e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.019326e+05                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112723389095883] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4670s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4037s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0633s for     8192 events => throughput is 1.29E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3929s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3375s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0549s for     8192 events => throughput is 1.49E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238464413054557E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    2.3110s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6631s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.6480s for    90112 events => throughput is 1.39E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.1189s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5295s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.5889s for    90112 events => throughput is 1.53E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.404960e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.561264e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.473914e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.545662e+05                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112725654777677] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7573s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7564s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0009s for     8192 events => throughput is 8.87E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.7590s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7568s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0010s for     8192 events => throughput is 8.12E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0012s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238470908598507E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    1.9333s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.9227s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0106s for    90112 events => throughput is 8.47E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.9627s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.9510s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0105s for    90112 events => throughput is 8.59E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0011s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.129185e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.151184e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.549775e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.548948e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.539449e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.576425e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.726001e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.715469e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.546076e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.585156e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.761871e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.753005e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.365545e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.440113e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.281294e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.293588e+07                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
index b3a8db7192..b6592dfe65 100644
--- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
@@ -2,21 +2,21 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g
 
 
 make USEBUILDDIR=1 BACKEND=cuda
+
 make USEBUILDDIR=1 BACKEND=cppnone
 
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cppsse4
+make USEBUILDDIR=1 BACKEND=cppavx2
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
-
-
-make USEBUILDDIR=1 BACKEND=cppavx2
-make USEBUILDDIR=1 BACKEND=cpp512y
 make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-28_21:18:54
+DATE: 2024-08-08_20:46:20
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0
  [UNWEIGHT] Wrote 365 events (found 1496 events)
- [COUNTERS] PROGRAM TOTAL          :    0.6896s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3548s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.3348s for     8192 events => throughput is 2.45E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.6929s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3702s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.3227s for     8192 events => throughput is 2.54E+04 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.6450s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3225s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.3225s for     8192 events => throughput is 2.54E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.6641s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3385s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.3256s for     8192 events => throughput is 2.52E+04 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    5.1061s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.5000s
- [COUNTERS] Fortran MEs      ( 1 ) :    3.6061s for    90112 events => throughput is 2.50E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    5.1698s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5570s
+ [COUNTERS] Fortran MEs      ( 1 ) :    3.6128s for    90112 events => throughput is 2.49E+04 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112748700702684] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.9893s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.6448s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.3444s for     8192 events => throughput is 2.38E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.6766s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3338s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.3416s for     8192 events => throughput is 2.40E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0011s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238482679400354E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    5.6402s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.8430s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    3.7972s for    90112 events => throughput is 2.37E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    5.3154s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5455s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    3.7687s for    90112 events => throughput is 2.39E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0012s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.504502e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.463950e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.493736e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.478616e+04                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112748702805033] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.6596s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4865s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1730s for     8192 events => throughput is 4.73E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.5103s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3345s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1749s for     8192 events => throughput is 4.68E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238482683055667E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    3.5611s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6531s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    1.9081s for    90112 events => throughput is 4.72E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    3.4746s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5384s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    1.9354s for    90112 events => throughput is 4.66E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.885906e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.832626e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.454856e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.815562e+04                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.5422s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4457s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0965s for     8192 events => throughput is 8.49E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4266s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3394s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0865s for     8192 events => throughput is 9.47E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0007s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    2.8283s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.7674s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    1.0609s for    90112 events => throughput is 8.49E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.4911s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5269s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.9636s for    90112 events => throughput is 9.35E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0007s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.150394e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.435081e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.706600e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.477580e+04                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.5467s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4535s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0932s for     8192 events => throughput is 8.79E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4142s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3362s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0774s for     8192 events => throughput is 1.06E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    2.5615s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6701s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.8914s for    90112 events => throughput is 1.01E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.3905s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5342s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.8556s for    90112 events => throughput is 1.05E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.036553e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.087061e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.038231e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.088736e+05                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112748700265108] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.5849s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4623s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1226s for     8192 events => throughput is 6.68E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4463s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3356s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1100s for     8192 events => throughput is 7.45E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238482666076374E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    2.8307s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6135s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    1.2171s for    90112 events => throughput is 7.40E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.7724s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5419s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    1.2297s for    90112 events => throughput is 7.33E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.036325e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.268797e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.017098e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.343356e+04                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.1011 [0.10112748601943165] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7581s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7527s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0054s for     8192 events => throughput is 1.51E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.7682s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7592s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0060s for     8192 events => throughput is 1.36E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0029s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.07924 [7.9238481937154381E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL          :    1.9432s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.9204s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0228s for    90112 events => throughput is 3.95E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.9875s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.9612s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0233s for    90112 events => throughput is 3.86E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0029s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.628045e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.654166e+06                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.092999e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.808330e+06                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.285885e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.001990e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.235467e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.235577e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.248772e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.000218e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.246161e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.245999e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.266042e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.996930e+06                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.742147e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.726284e+06                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt
index a3214916d8..9f965c04b5 100644
--- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt
@@ -4,8 +4,8 @@ make USEBUILDDIR=1 BACKEND=cuda
 
 
 make USEBUILDDIR=1 BACKEND=cppnone
-make USEBUILDDIR=1 BACKEND=cppsse4
 
+make USEBUILDDIR=1 BACKEND=cppsse4
 make USEBUILDDIR=1 BACKEND=cppavx2
 
 make USEBUILDDIR=1 BACKEND=cpp512y
@@ -13,14 +13,14 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
 make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Nothing to be done for 'all'.
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-28_21:19:38
+DATE: 2024-08-08_20:47:02
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0
  [UNWEIGHT] Wrote 11 events (found 187 events)
- [COUNTERS] PROGRAM TOTAL          :    4.3806s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.2416s
- [COUNTERS] Fortran MEs      ( 1 ) :    4.1390s for     8192 events => throughput is 1.98E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    4.5167s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2657s
+ [COUNTERS] Fortran MEs      ( 1 ) :    4.2511s for     8192 events => throughput is 1.93E+03 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    4.4016s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.2413s
- [COUNTERS] Fortran MEs      ( 1 ) :    4.1603s for     8192 events => throughput is 1.97E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    4.4866s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2643s
+ [COUNTERS] Fortran MEs      ( 1 ) :    4.2223s for     8192 events => throughput is 1.94E+03 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558083266099815] fbridge_mode=0
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :   47.5196s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.7600s
- [COUNTERS] Fortran MEs      ( 1 ) :   45.7596s for    90112 events => throughput is 1.97E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   48.4461s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.8443s
+ [COUNTERS] Fortran MEs      ( 1 ) :   46.6018s for    90112 events => throughput is 1.93E+03 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320556621222236] fbridge_mode=1
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    8.8038s
- [COUNTERS] Fortran Overhead ( 0 ) :    4.4979s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    4.3060s for     8192 events => throughput is 1.90E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    4.6404s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2618s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    4.3690s for     8192 events => throughput is 1.88E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0096s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558083266099799] fbridge_mode=1
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :   53.4475s
- [COUNTERS] Fortran Overhead ( 0 ) :    5.9535s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   47.4939s for    90112 events => throughput is 1.90E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   49.9380s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7954s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   48.1336s for    90112 events => throughput is 1.87E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0090s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.922216e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.926413e+03                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.959395e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.935484e+03                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320556621222236] fbridge_mode=1
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    4.7615s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.4839s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    2.2777s for     8192 events => throughput is 3.60E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.6125s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2606s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    2.3472s for     8192 events => throughput is 3.49E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0046s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558083266099785] fbridge_mode=1
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :   30.6075s
- [COUNTERS] Fortran Overhead ( 0 ) :    4.0936s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   26.5139s for    90112 events => throughput is 3.40E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   27.5257s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.8027s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   25.7180s for    90112 events => throughput is 3.50E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0050s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.496612e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.649842e+03                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.539346e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.636818e+03                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320556621222231] fbridge_mode=1
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    2.2183s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2245s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.9937s for     8192 events => throughput is 8.24E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.2653s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2598s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    1.0030s for     8192 events => throughput is 8.17E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0026s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558083266099799] fbridge_mode=1
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :   14.0676s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.7785s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   11.2891s for    90112 events => throughput is 7.98E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   12.8598s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7908s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   11.0665s for    90112 events => throughput is 8.14E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0025s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.419384e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.344831e+03                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.678823e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.416676e+03                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320556621222231] fbridge_mode=1
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    1.9878s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.1062s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.8816s for     8192 events => throughput is 9.29E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.1673s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2599s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.9051s for     8192 events => throughput is 9.05E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0023s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558083266099799] fbridge_mode=1
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :   12.3962s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.6208s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    9.7754s for    90112 events => throughput is 9.22E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   11.7872s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.8132s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    9.9717s for    90112 events => throughput is 9.04E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0022s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.529916e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.472083e+03                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.344439e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.534343e+03                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320556621222231] fbridge_mode=1
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    2.4488s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.3506s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    1.0982s for     8192 events => throughput is 7.46E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.3936s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2589s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    1.1317s for     8192 events => throughput is 7.24E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0030s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558083266099799] fbridge_mode=1
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :   15.0103s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.8885s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   12.1218s for    90112 events => throughput is 7.43E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   14.2691s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.8171s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   12.4493s for    90112 events => throughput is 7.24E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0027s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.477284e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.935643e+03                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.503375e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.348983e+03                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320556621222225] fbridge_mode=1
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7764s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7438s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0326s for     8192 events => throughput is 2.51E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.7693s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.6983s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0363s for     8192 events => throughput is 2.26E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0347s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558083266099782] fbridge_mode=1
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :    2.6460s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.2818s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.3642s for    90112 events => throughput is 2.47E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.6062s
+ [COUNTERS] Fortran Overhead ( 0 ) :    2.2048s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.3669s for    90112 events => throughput is 2.46E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0344s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.283334e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.290486e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.506901e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.506388e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.117711e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.134196e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.183664e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.177921e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.114395e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.129278e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.181079e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.155764e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.111113e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.126990e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.444874e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.446377e+05                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
index 565fe287ce..cd633f37c7 100644
--- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
@@ -4,8 +4,8 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g
 make USEBUILDDIR=1 BACKEND=cuda
 
 make USEBUILDDIR=1 BACKEND=cppnone
-
 make USEBUILDDIR=1 BACKEND=cppsse4
+
 make USEBUILDDIR=1 BACKEND=cppavx2
 
 make USEBUILDDIR=1 BACKEND=cpp512y
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-28_21:23:54
+DATE: 2024-08-08_20:51:00
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0
  [UNWEIGHT] Wrote 11 events (found 187 events)
- [COUNTERS] PROGRAM TOTAL          :    4.3980s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.2420s
- [COUNTERS] Fortran MEs      ( 1 ) :    4.1560s for     8192 events => throughput is 1.97E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    4.4959s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2635s
+ [COUNTERS] Fortran MEs      ( 1 ) :    4.2323s for     8192 events => throughput is 1.94E+03 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    4.4132s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.2372s
- [COUNTERS] Fortran MEs      ( 1 ) :    4.1760s for     8192 events => throughput is 1.96E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    4.4788s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2631s
+ [COUNTERS] Fortran MEs      ( 1 ) :    4.2156s for     8192 events => throughput is 1.94E+03 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558083266099815] fbridge_mode=0
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :   47.3387s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.7498s
- [COUNTERS] Fortran MEs      ( 1 ) :   45.5889s for    90112 events => throughput is 1.98E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   48.4352s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.8357s
+ [COUNTERS] Fortran MEs      ( 1 ) :   46.5995s for    90112 events => throughput is 1.93E+03 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320716615478996] fbridge_mode=1
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    8.4803s
- [COUNTERS] Fortran Overhead ( 0 ) :    4.3064s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    4.1739s for     8192 events => throughput is 1.96E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    4.5354s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2660s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    4.2605s for     8192 events => throughput is 1.92E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0089s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558162567940870] fbridge_mode=1
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :   52.0102s
- [COUNTERS] Fortran Overhead ( 0 ) :    5.8677s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   46.1425s for    90112 events => throughput is 1.95E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   48.5468s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7982s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   46.7401s for    90112 events => throughput is 1.93E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0085s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.015479e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.996945e+03                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.013117e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.982014e+03                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320708851010073] fbridge_mode=1
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    2.5680s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.4066s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    1.1614s for     8192 events => throughput is 7.05E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4573s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2634s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    1.1914s for     8192 events => throughput is 6.88E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0025s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558157380141428] fbridge_mode=1
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :   15.5194s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.8862s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   12.6332s for    90112 events => throughput is 7.13E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   14.6570s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7854s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   12.8693s for    90112 events => throughput is 7.00E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0024s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.320236e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.255598e+03                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.204448e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.246435e+03                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320704806184321] fbridge_mode=1
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    1.2285s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7290s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4995s for     8192 events => throughput is 1.64E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.7739s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2587s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.5137s for     8192 events => throughput is 1.59E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0015s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558158459897135] fbridge_mode=1
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :    7.7643s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.2713s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    5.4931s for    90112 events => throughput is 1.64E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    7.4672s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7991s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    5.6666s for    90112 events => throughput is 1.59E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0014s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.674256e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.606140e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.684230e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.576957e+04                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320704806184321] fbridge_mode=1
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    1.1706s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.6998s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4708s for     8192 events => throughput is 1.74E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.7680s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2709s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4957s for     8192 events => throughput is 1.65E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0013s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558158459897135] fbridge_mode=1
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :    7.1094s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.2054s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    4.9040s for    90112 events => throughput is 1.84E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    6.7809s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7804s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    4.9992s for    90112 events => throughput is 1.80E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0014s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.876397e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.849666e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.833562e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.858554e+04                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320713685871445] fbridge_mode=1
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    1.4404s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.8351s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.6053s for     8192 events => throughput is 1.35E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8187s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2599s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.5571s for     8192 events => throughput is 1.47E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0017s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558162184774774] fbridge_mode=1
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :    8.3122s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.3041s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    6.0081s for    90112 events => throughput is 1.50E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    7.9104s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7899s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    6.1190s for    90112 events => throughput is 1.47E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0015s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.515129e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.496224e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.524953e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.504281e+04                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320719394836651] fbridge_mode=1
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7481s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7253s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0228s for     8192 events => throughput is 3.60E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.7396s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.6908s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0246s for     8192 events => throughput is 3.32E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0242s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558167135091578] fbridge_mode=1
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :    2.4757s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.2273s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2484s for    90112 events => throughput is 3.63E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.4680s
+ [COUNTERS] Fortran Overhead ( 0 ) :    2.1917s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2521s for    90112 events => throughput is 3.57E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0241s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.375249e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.382988e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.741898e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.717142e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.119355e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.139748e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.307435e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.304954e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.151709e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.085623e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.303009e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.300454e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.049512e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.130448e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.396866e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.397157e+05                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
index 93675b1fbf..27512be658 100644
--- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
@@ -1,9 +1,9 @@
 Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 
 
+make USEBUILDDIR=1 BACKEND=cuda
 
 
-make USEBUILDDIR=1 BACKEND=cuda
 make USEBUILDDIR=1 BACKEND=cppnone
 make USEBUILDDIR=1 BACKEND=cppsse4
 make USEBUILDDIR=1 BACKEND=cppavx2
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-28_21:27:14
+DATE: 2024-08-08_20:54:10
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0
  [UNWEIGHT] Wrote 11 events (found 187 events)
- [COUNTERS] PROGRAM TOTAL          :    4.3736s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.2409s
- [COUNTERS] Fortran MEs      ( 1 ) :    4.1327s for     8192 events => throughput is 1.98E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    4.4700s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2619s
+ [COUNTERS] Fortran MEs      ( 1 ) :    4.2081s for     8192 events => throughput is 1.95E+03 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320556621222242] fbridge_mode=0
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    4.3735s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.2382s
- [COUNTERS] Fortran MEs      ( 1 ) :    4.1353s for     8192 events => throughput is 1.98E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    4.4683s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2604s
+ [COUNTERS] Fortran MEs      ( 1 ) :    4.2079s for     8192 events => throughput is 1.95E+03 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558083266099815] fbridge_mode=0
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :   47.4052s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.7410s
- [COUNTERS] Fortran MEs      ( 1 ) :   45.6641s for    90112 events => throughput is 1.97E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   48.3196s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.8366s
+ [COUNTERS] Fortran MEs      ( 1 ) :   46.4830s for    90112 events => throughput is 1.94E+03 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320556893412546] fbridge_mode=1
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    8.8075s
- [COUNTERS] Fortran Overhead ( 0 ) :    4.4671s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    4.3405s for     8192 events => throughput is 1.89E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    4.6760s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2586s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    4.4088s for     8192 events => throughput is 1.86E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0086s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558083370546855] fbridge_mode=1
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :   53.8850s
- [COUNTERS] Fortran Overhead ( 0 ) :    5.9787s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   47.9063s for    90112 events => throughput is 1.88E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   50.5724s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.8031s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   48.7604s for    90112 events => throughput is 1.85E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0089s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.948359e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.909521e+03                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.951349e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.899981e+03                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320556780656974] fbridge_mode=1
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    4.7620s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.4646s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    2.2974s for     8192 events => throughput is 3.57E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.5687s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2576s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    2.3063s for     8192 events => throughput is 3.55E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0048s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558083390630859] fbridge_mode=1
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :   29.1849s
- [COUNTERS] Fortran Overhead ( 0 ) :    4.0005s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   25.1844s for    90112 events => throughput is 3.58E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   27.4318s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7915s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   25.6356s for    90112 events => throughput is 3.52E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0047s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.700037e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.646364e+03                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.594428e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.634455e+03                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320556770726795] fbridge_mode=1
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    2.2008s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2107s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.9900s for     8192 events => throughput is 8.27E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.2686s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2604s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    1.0056s for     8192 events => throughput is 8.15E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0025s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558083379720220] fbridge_mode=1
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :   13.6123s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.7227s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   10.8896s for    90112 events => throughput is 8.28E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   12.9032s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7920s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   11.1088s for    90112 events => throughput is 8.11E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0024s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.547846e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.153831e+03                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.522651e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.410165e+03                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320556770726795] fbridge_mode=1
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    1.9517s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.0869s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.8648s for     8192 events => throughput is 9.47E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.1480s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2607s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.8850s for     8192 events => throughput is 9.26E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0023s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558083379720220] fbridge_mode=1
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :   12.1428s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.5992s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    9.5436s for    90112 events => throughput is 9.44E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   11.5478s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7830s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    9.7625s for    90112 events => throughput is 9.23E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0024s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.818655e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.509937e+03                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.752589e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.503575e+03                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320556770726795] fbridge_mode=1
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    2.4530s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.3428s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    1.1102s for     8192 events => throughput is 7.38E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.3881s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.2592s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    1.1259s for     8192 events => throughput is 7.28E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0031s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558083379720220] fbridge_mode=1
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :   16.4311s
- [COUNTERS] Fortran Overhead ( 0 ) :    3.0180s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   13.4132s for    90112 events => throughput is 6.72E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   14.4378s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7995s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   12.6355s for    90112 events => throughput is 7.13E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0028s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.834157e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.378664e+03                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.943776e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.252552e+03                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.4632 [0.46320556665261842] fbridge_mode=1
  [UNWEIGHT] Wrote 11 events (found 168 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7919s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7589s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0330s for     8192 events => throughput is 2.48E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.7612s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.6909s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0360s for     8192 events => throughput is 2.27E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0343s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_
  [XSECTION] ChannelId = 112
  [XSECTION] Cross section = 0.2256 [0.22558083224243403] fbridge_mode=1
  [UNWEIGHT] Wrote 18 events (found 294 events)
- [COUNTERS] PROGRAM TOTAL          :    2.7451s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.3819s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.3632s for    90112 events => throughput is 2.48E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.5943s
+ [COUNTERS] Fortran Overhead ( 0 ) :    2.1940s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.3660s for    90112 events => throughput is 2.46E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0343s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.281497e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.292672e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.508846e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.513091e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.120326e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.132768e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.183615e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.151465e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.108754e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.134281e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.168874e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.177596e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.121179e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.130147e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.453826e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.451952e+05                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
index a3ac7b5c1f..dab5f736a0 100644
--- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
@@ -3,17 +3,17 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g
 
 make USEBUILDDIR=1 BACKEND=cuda
 
+
 make USEBUILDDIR=1 BACKEND=cppnone
 make USEBUILDDIR=1 BACKEND=cppsse4
-
 make USEBUILDDIR=1 BACKEND=cppavx2
+
+make USEBUILDDIR=1 BACKEND=cpp512y
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 
-make USEBUILDDIR=1 BACKEND=cpp512y
-
 make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-28_21:32:46
+DATE: 2024-08-08_20:59:36
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0
  [UNWEIGHT] Wrote 1 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :  101.2642s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.5189s
- [COUNTERS] Fortran MEs      ( 1 ) :  100.7453s for     8192 events => throughput is 8.13E+01 events/s
+ [COUNTERS] PROGRAM TOTAL          :  102.0811s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5116s
+ [COUNTERS] Fortran MEs      ( 1 ) :  101.5694s for     8192 events => throughput is 8.07E+01 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :   98.9549s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.5002s
- [COUNTERS] Fortran MEs      ( 1 ) :   98.4548s for     8192 events => throughput is 8.32E+01 events/s
+ [COUNTERS] PROGRAM TOTAL          :  102.0739s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5163s
+ [COUNTERS] Fortran MEs      ( 1 ) :  101.5576s for     8192 events => throughput is 8.07E+01 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          : 1078.9193s
- [COUNTERS] Fortran Overhead ( 0 ) :    4.3729s
- [COUNTERS] Fortran MEs      ( 1 ) : 1074.5464s for    90112 events => throughput is 8.39E+01 events/s
+ [COUNTERS] PROGRAM TOTAL          : 1120.7697s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.3745s
+ [COUNTERS] Fortran MEs      ( 1 ) : 1116.3951s for    90112 events => throughput is 8.07E+01 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.24e-06 [1.2403985227939193E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :  226.9636s
- [COUNTERS] Fortran Overhead ( 0 ) :  104.1252s
- [COUNTERS] CudaCpp MEs      ( 2 ) :  122.8384s for     8192 events => throughput is 6.67E+01 events/s
+ [COUNTERS] PROGRAM TOTAL          :  122.6268s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5175s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :  121.9186s for     8192 events => throughput is 6.72E+01 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.1907s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          : 1453.2216s
- [COUNTERS] Fortran Overhead ( 0 ) :  108.8256s
- [COUNTERS] CudaCpp MEs      ( 2 ) : 1344.3960s for    90112 events => throughput is 6.70E+01 events/s
+ [COUNTERS] PROGRAM TOTAL          : 1388.7153s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.3988s
+ [COUNTERS] CudaCpp MEs      ( 2 ) : 1384.1234s for    90112 events => throughput is 6.51E+01 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.1931s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.957500e+01                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.880201e+01                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.966466e+01                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.389775e+01                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.24e-06 [1.2403985227939197E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :  110.0381s
- [COUNTERS] Fortran Overhead ( 0 ) :   50.3711s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   59.6670s for     8192 events => throughput is 1.37E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :   60.8180s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5182s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   60.1993s for     8192 events => throughput is 1.36E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.1005s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.332e-07 [2.3322993086656017E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          :  713.5237s
- [COUNTERS] Fortran Overhead ( 0 ) :   54.3392s
- [COUNTERS] CudaCpp MEs      ( 2 ) :  659.1845s for    90112 events => throughput is 1.37E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :  663.6261s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.4076s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :  659.1171s for    90112 events => throughput is 1.37E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.1014s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.640648e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.603881e+02                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.642221e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.607115e+02                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :   51.6707s
- [COUNTERS] Fortran Overhead ( 0 ) :   23.8142s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   27.8565s for     8192 events => throughput is 2.94E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :   28.7968s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5160s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   28.2344s for     8192 events => throughput is 2.90E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0464s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          :  333.1028s
- [COUNTERS] Fortran Overhead ( 0 ) :   27.4583s
- [COUNTERS] CudaCpp MEs      ( 2 ) :  305.6444s for    90112 events => throughput is 2.95E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :  314.6312s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.4324s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :  310.1525s for    90112 events => throughput is 2.91E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0464s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.542437e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.378917e+02                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.558618e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.496128e+02                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :   45.2154s
- [COUNTERS] Fortran Overhead ( 0 ) :   20.7378s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   24.4776s for     8192 events => throughput is 3.35E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :   25.3254s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5203s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   24.7644s for     8192 events => throughput is 3.31E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0408s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          :  293.9503s
- [COUNTERS] Fortran Overhead ( 0 ) :   24.5760s
- [COUNTERS] CudaCpp MEs      ( 2 ) :  269.3742s for    90112 events => throughput is 3.35E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :  277.9808s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.4083s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :  273.5305s for    90112 events => throughput is 3.29E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0420s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.038194e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.986386e+02                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.054728e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.006448e+02                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :   46.6506s
- [COUNTERS] Fortran Overhead ( 0 ) :   22.6220s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   24.0285s for     8192 events => throughput is 3.41E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :   25.0869s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5172s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   24.5238s for     8192 events => throughput is 3.34E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0459s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          :  291.3320s
- [COUNTERS] Fortran Overhead ( 0 ) :   26.5806s
- [COUNTERS] CudaCpp MEs      ( 2 ) :  264.7513s for    90112 events => throughput is 3.40E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :  271.0840s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.3948s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :  266.6404s for    90112 events => throughput is 3.38E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0489s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.666698e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.641160e+02                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.681846e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.622116e+02                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.24e-06 [1.2403985227939195E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :    4.2374s
- [COUNTERS] Fortran Overhead ( 0 ) :    3.1562s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    1.0812s for     8192 events => throughput is 7.58E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    3.2426s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.0583s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    1.0970s for     8192 events => throughput is 7.47E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    1.0873s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.332e-07 [2.3322993086656006E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          :   18.7608s
- [COUNTERS] Fortran Overhead ( 0 ) :    6.8530s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   11.9078s for    90112 events => throughput is 7.57E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   17.9203s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.9107s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   11.9249s for    90112 events => throughput is 7.56E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    1.0847s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.502460e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.521131e+03                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.309158e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.292650e+03                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.222869e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.241733e+03                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 512 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.577320e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.585186e+03                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.238055e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.235154e+03                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.437122e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.473644e+03                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.224755e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.236111e+03                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.231636e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.235762e+03                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
index a539e33f24..4ffdbee10a 100644
--- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
@@ -4,8 +4,8 @@ make USEBUILDDIR=1 BACKEND=cuda
 
 
 make USEBUILDDIR=1 BACKEND=cppnone
-make USEBUILDDIR=1 BACKEND=cppsse4
 
+make USEBUILDDIR=1 BACKEND=cppsse4
 make USEBUILDDIR=1 BACKEND=cppavx2
 
 make USEBUILDDIR=1 BACKEND=cpp512y
@@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 
 make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Nothing to be done for 'all'.
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-28_23:01:56
+DATE: 2024-08-08_22:23:03
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0
  [UNWEIGHT] Wrote 1 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :   97.1861s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4946s
- [COUNTERS] Fortran MEs      ( 1 ) :   96.6914s for     8192 events => throughput is 8.47E+01 events/s
+ [COUNTERS] PROGRAM TOTAL          :  101.3873s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5075s
+ [COUNTERS] Fortran MEs      ( 1 ) :  100.8798s for     8192 events => throughput is 8.12E+01 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :   96.7329s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4973s
- [COUNTERS] Fortran MEs      ( 1 ) :   96.2355s for     8192 events => throughput is 8.51E+01 events/s
+ [COUNTERS] PROGRAM TOTAL          :  102.2416s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5135s
+ [COUNTERS] Fortran MEs      ( 1 ) :  101.7281s for     8192 events => throughput is 8.05E+01 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          : 1066.1378s
- [COUNTERS] Fortran Overhead ( 0 ) :    4.3607s
- [COUNTERS] Fortran MEs      ( 1 ) : 1061.7771s for    90112 events => throughput is 8.49E+01 events/s
+ [COUNTERS] PROGRAM TOTAL          : 1114.7300s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.3428s
+ [COUNTERS] Fortran MEs      ( 1 ) : 1110.3872s for    90112 events => throughput is 8.12E+01 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -134,9 +134,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.241e-06 [1.2405719945779552E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :  204.5576s
- [COUNTERS] Fortran Overhead ( 0 ) :   94.7222s
- [COUNTERS] CudaCpp MEs      ( 2 ) :  109.8354s for     8192 events => throughput is 7.46E+01 events/s
+ [COUNTERS] PROGRAM TOTAL          :  111.0089s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5100s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :  110.3187s for     8192 events => throughput is 7.43E+01 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.1802s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -168,9 +169,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.333e-07 [2.3326290777570335E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          : 1302.1844s
- [COUNTERS] Fortran Overhead ( 0 ) :   96.3064s
- [COUNTERS] CudaCpp MEs      ( 2 ) : 1205.8781s for    90112 events => throughput is 7.47E+01 events/s
+ [COUNTERS] PROGRAM TOTAL          : 1216.8479s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.4035s
+ [COUNTERS] CudaCpp MEs      ( 2 ) : 1212.2644s for    90112 events => throughput is 7.43E+01 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.1800s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -183,12 +185,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.892403e+01                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.795452e+01                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.896902e+01                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.783118e+01                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -212,9 +214,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.241e-06 [1.2405716994349971E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :   50.4740s
- [COUNTERS] Fortran Overhead ( 0 ) :   23.8797s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   26.5944s for     8192 events => throughput is 3.08E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :   27.4750s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5164s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   26.9120s for     8192 events => throughput is 3.04E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0465s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -246,9 +249,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.333e-07 [2.3326284885505778E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          :  322.1801s
- [COUNTERS] Fortran Overhead ( 0 ) :   27.6823s
- [COUNTERS] CudaCpp MEs      ( 2 ) :  294.4978s for    90112 events => throughput is 3.06E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :  300.8248s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.4082s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :  296.3700s for    90112 events => throughput is 3.04E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0466s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -261,12 +265,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.518666e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.485944e+02                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.484203e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.470723e+02                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -290,9 +294,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.241e-06 [1.2405716646933743E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :   26.5623s
- [COUNTERS] Fortran Overhead ( 0 ) :   12.2724s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   14.2899s for     8192 events => throughput is 5.73E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :   14.5936s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5183s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   14.0522s for     8192 events => throughput is 5.83E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0231s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -324,9 +329,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.333e-07 [2.3326277033163402E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          :  174.5012s
- [COUNTERS] Fortran Overhead ( 0 ) :   16.5646s
- [COUNTERS] CudaCpp MEs      ( 2 ) :  157.9366s for    90112 events => throughput is 5.71E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :  158.5014s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.4348s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :  154.0430s for    90112 events => throughput is 5.85E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0236s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -339,12 +345,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.701727e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.991558e+02                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.718879e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.952358e+02                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -368,9 +374,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.241e-06 [1.2405716646933743E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :   23.9391s
- [COUNTERS] Fortran Overhead ( 0 ) :   11.1292s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   12.8099s for     8192 events => throughput is 6.40E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :   12.8606s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5199s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   12.3203s for     8192 events => throughput is 6.65E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0204s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -402,9 +409,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.333e-07 [2.3326277033163402E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          :  155.8533s
- [COUNTERS] Fortran Overhead ( 0 ) :   14.9625s
- [COUNTERS] CudaCpp MEs      ( 2 ) :  140.8908s for    90112 events => throughput is 6.40E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :  139.5398s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.3981s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :  135.1212s for    90112 events => throughput is 6.67E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0205s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -417,12 +425,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.716591e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.890802e+02                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.738216e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.069181e+02                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -446,9 +454,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.241e-06 [1.2405719257109645E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :   25.2840s
- [COUNTERS] Fortran Overhead ( 0 ) :   12.4664s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   12.8176s for     8192 events => throughput is 6.39E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :   12.8130s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5166s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   12.2739s for     8192 events => throughput is 6.67E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0225s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -480,9 +489,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.333e-07 [2.3326283665697276E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          :  149.3053s
- [COUNTERS] Fortran Overhead ( 0 ) :   16.1390s
- [COUNTERS] CudaCpp MEs      ( 2 ) :  133.1664s for    90112 events => throughput is 6.77E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :  139.5916s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.4260s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :  135.1428s for    90112 events => throughput is 6.67E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0228s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -495,12 +505,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.340832e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.223008e+02                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.256949e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.135239e+02                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -523,9 +533,10 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.241e-06 [1.2405721007137020E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :    2.5997s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.0687s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.5310s for     8192 events => throughput is 1.54E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.1089s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.0215s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.5332s for     8192 events => throughput is 1.54E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.5542s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -556,9 +567,10 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.333e-07 [2.3326295421688232E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          :   11.4660s
- [COUNTERS] Fortran Overhead ( 0 ) :    5.7555s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    5.7106s for    90112 events => throughput is 1.58E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :   11.2844s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.8851s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    5.8421s for    90112 events => throughput is 1.54E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.5572s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -571,42 +583,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.530700e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.533878e+04                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.545413e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.547825e+04                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.163666e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.147653e+04                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 512 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.148478e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.124611e+04                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.113035e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.134315e+04                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.205309e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.131039e+04                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.129848e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.139642e+04                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.016707e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.021489e+03                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt
index 78332de82a..e8248fddca 100644
--- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt
@@ -1,10 +1,10 @@
 Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
 
-make USEBUILDDIR=1 BACKEND=cuda
-
 
+make USEBUILDDIR=1 BACKEND=cuda
 
 make USEBUILDDIR=1 BACKEND=cppnone
+
 make USEBUILDDIR=1 BACKEND=cppsse4
 make USEBUILDDIR=1 BACKEND=cppavx2
 
@@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 
 make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Nothing to be done for 'all'.
@@ -24,15 +24,15 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-29_00:09:11
+DATE: 2024-08-08_23:26:17
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0
  [UNWEIGHT] Wrote 1 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :   96.7022s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4938s
- [COUNTERS] Fortran MEs      ( 1 ) :   96.2084s for     8192 events => throughput is 8.51E+01 events/s
+ [COUNTERS] PROGRAM TOTAL          :  103.0122s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5139s
+ [COUNTERS] Fortran MEs      ( 1 ) :  102.4983s for     8192 events => throughput is 7.99E+01 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :   96.6669s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.5001s
- [COUNTERS] Fortran MEs      ( 1 ) :   96.1668s for     8192 events => throughput is 8.52E+01 events/s
+ [COUNTERS] PROGRAM TOTAL          :  101.2993s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5294s
+ [COUNTERS] Fortran MEs      ( 1 ) :  100.7699s for     8192 events => throughput is 8.13E+01 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          : 1066.3936s
- [COUNTERS] Fortran Overhead ( 0 ) :    4.3326s
- [COUNTERS] Fortran MEs      ( 1 ) : 1062.0609s for    90112 events => throughput is 8.48E+01 events/s
+ [COUNTERS] PROGRAM TOTAL          : 1118.7642s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.3619s
+ [COUNTERS] Fortran MEs      ( 1 ) : 1114.4022s for    90112 events => throughput is 8.09E+01 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.24e-06 [1.2403985299359844E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :  227.5404s
- [COUNTERS] Fortran Overhead ( 0 ) :  104.7327s
- [COUNTERS] CudaCpp MEs      ( 2 ) :  122.8076s for     8192 events => throughput is 6.67E+01 events/s
+ [COUNTERS] PROGRAM TOTAL          :  125.7885s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5193s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :  125.0621s for     8192 events => throughput is 6.55E+01 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.2071s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.332e-07 [2.3322993212353001E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          : 1460.6766s
- [COUNTERS] Fortran Overhead ( 0 ) :  108.1514s
- [COUNTERS] CudaCpp MEs      ( 2 ) : 1352.5253s for    90112 events => throughput is 6.66E+01 events/s
+ [COUNTERS] PROGRAM TOTAL          : 1322.8827s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.3903s
+ [COUNTERS] CudaCpp MEs      ( 2 ) : 1318.2870s for    90112 events => throughput is 6.84E+01 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.2054s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.953346e+01                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.761597e+01                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.887442e+01                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.724704e+01                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.24e-06 [1.2403985295828471E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :  113.0177s
- [COUNTERS] Fortran Overhead ( 0 ) :   51.7429s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   61.2748s for     8192 events => throughput is 1.34E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :   62.4510s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5155s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   61.8333s for     8192 events => throughput is 1.32E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.1022s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.332e-07 [2.3322993222645653E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          :  722.6357s
- [COUNTERS] Fortran Overhead ( 0 ) :   55.1190s
- [COUNTERS] CudaCpp MEs      ( 2 ) :  667.5167s for    90112 events => throughput is 1.35E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :  684.8121s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.4198s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :  680.2921s for    90112 events => throughput is 1.32E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.1003s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.603521e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.589042e+02                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.594140e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.588931e+02                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :   49.4491s
- [COUNTERS] Fortran Overhead ( 0 ) :   22.5625s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   26.8866s for     8192 events => throughput is 3.05E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :   27.0092s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5181s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   26.4459s for     8192 events => throughput is 3.10E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0452s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          :  323.2367s
- [COUNTERS] Fortran Overhead ( 0 ) :   26.2106s
- [COUNTERS] CudaCpp MEs      ( 2 ) :  297.0260s for    90112 events => throughput is 3.03E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :  298.0409s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.4173s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :  293.5790s for    90112 events => throughput is 3.07E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0445s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.746482e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.648206e+02                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.754234e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.625373e+02                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :   43.5060s
- [COUNTERS] Fortran Overhead ( 0 ) :   19.6610s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   23.8450s for     8192 events => throughput is 3.44E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :   24.3540s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5168s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   23.7936s for     8192 events => throughput is 3.44E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0436s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          :  285.3812s
- [COUNTERS] Fortran Overhead ( 0 ) :   23.3907s
- [COUNTERS] CudaCpp MEs      ( 2 ) :  261.9904s for    90112 events => throughput is 3.44E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :  269.6777s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.4164s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :  265.2234s for    90112 events => throughput is 3.40E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0378s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.296352e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.285493e+02                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.310764e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.289545e+02                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :   45.6608s
- [COUNTERS] Fortran Overhead ( 0 ) :   22.1706s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   23.4902s for     8192 events => throughput is 3.49E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :   25.1227s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.5145s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   24.5642s for     8192 events => throughput is 3.33E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0441s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          :  283.0596s
- [COUNTERS] Fortran Overhead ( 0 ) :   25.7915s
- [COUNTERS] CudaCpp MEs      ( 2 ) :  257.2681s for    90112 events => throughput is 3.50E+02 events/s
+ [COUNTERS] PROGRAM TOTAL          :  274.1583s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.4200s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :  269.6946s for    90112 events => throughput is 3.34E+02 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0436s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.790026e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.625912e+02                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.795378e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.662510e+02                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 1.24e-06 [1.2403985217419736E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL          :    3.5887s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.7249s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.8638s for     8192 events => throughput is 9.48E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.7717s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.0261s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.8763s for     8192 events => throughput is 9.35E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.8694s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.332e-07 [2.3322993078576733E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL          :   15.9655s
- [COUNTERS] Fortran Overhead ( 0 ) :    6.4579s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    9.5077s for    90112 events => throughput is 9.48E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   15.2659s
+ [COUNTERS] Fortran Overhead ( 0 ) :    4.8943s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    9.5013s for    90112 events => throughput is 9.48E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.8704s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.454304e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.434661e+03                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.092578e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.089765e+04                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.107884e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.112116e+04                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 512 32 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.159186e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.160890e+04                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.108717e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.108390e+04                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.111609e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.111312e+04                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.112293e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.109990e+04                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 ***
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.645128e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.638783e+03                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt
index 5750f0dd36..b877c26fea 100644
--- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt
@@ -1,22 +1,22 @@
 Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
 
-
 make USEBUILDDIR=1 BACKEND=cuda
 
+
+
 make USEBUILDDIR=1 BACKEND=cppnone
 make USEBUILDDIR=1 BACKEND=cppsse4
-
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 
 make USEBUILDDIR=1 BACKEND=cpp512y
-
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+
+make USEBUILDDIR=1 BACKEND=cpp512z
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-28_21:31:29
+DATE: 2024-08-08_20:58:09
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0
  [UNWEIGHT] Wrote 404 events (found 1817 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4791s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4063s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0728s for     8192 events => throughput is 1.13E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4754s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.4051s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0703s for     8192 events => throughput is 1.16E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0
  [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4114s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3366s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0748s for     8192 events => throughput is 1.10E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4153s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3445s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0708s for     8192 events => throughput is 1.16E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0
  [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL          :    2.3443s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.5396s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.8047s for    90112 events => throughput is 1.12E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.3303s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5573s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.7730s for    90112 events => throughput is 1.17E+05 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110539351263335] fbridge_mode=1
  [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4894s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4114s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0780s for     8192 events => throughput is 1.05E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4189s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3418s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0764s for     8192 events => throughput is 1.07E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1
  [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL          :    2.4884s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6200s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.8684s for    90112 events => throughput is 1.04E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.3766s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5374s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.8384s for    90112 events => throughput is 1.07E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.076463e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.104999e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.077689e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.080050e+05                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110539351262541] fbridge_mode=1
  [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4183s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3754s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0429s for     8192 events => throughput is 1.91E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3875s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3450s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0419s for     8192 events => throughput is 1.96E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2151 [0.21510686556561281] fbridge_mode=1
  [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL          :    2.0553s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.5812s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4741s for    90112 events => throughput is 1.90E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.0024s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5394s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4624s for    90112 events => throughput is 1.95E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.913652e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.937885e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.900215e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.972484e+05                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1
  [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3820s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3572s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0248s for     8192 events => throughput is 3.31E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3673s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3427s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0240s for     8192 events => throughput is 3.41E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1
  [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL          :    1.8264s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.5516s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2748s for    90112 events => throughput is 3.28E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.8108s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5445s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2657s for    90112 events => throughput is 3.39E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.312529e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.384861e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.346229e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.378583e+05                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1
  [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3800s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3577s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0223s for     8192 events => throughput is 3.68E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3684s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3456s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0222s for     8192 events => throughput is 3.69E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1
  [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL          :    1.7983s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.5529s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2454s for    90112 events => throughput is 3.67E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7798s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5417s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2375s for    90112 events => throughput is 3.79E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.396153e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.465878e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.717666e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.626688e+05                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1
  [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4112s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3763s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0348s for     8192 events => throughput is 2.35E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3809s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3477s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0325s for     8192 events => throughput is 2.52E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0007s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1
  [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL          :    2.0061s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6154s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.3906s for    90112 events => throughput is 2.31E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.8986s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5431s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.3549s for    90112 events => throughput is 2.54E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.408273e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.412835e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.448995e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.491870e+05                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110539351263363] fbridge_mode=1
  [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7845s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7838s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0007s for     8192 events => throughput is 1.14E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.7705s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7685s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0008s for     8192 events => throughput is 1.03E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0012s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2151 [0.21510686556561304] fbridge_mode=1
  [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL          :    1.9715s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.9630s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0085s for    90112 events => throughput is 1.06E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.9737s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.9648s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0078s for    90112 events => throughput is 1.15E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0012s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.482506e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.555983e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.967685e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.037158e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.213762e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.629928e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.531197e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.566255e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.254372e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.636845e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.843988e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.850724e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.244977e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.619360e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.788434e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.790736e+07                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt
index 19656e6368..8ac388b886 100644
--- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt
@@ -1,30 +1,30 @@
 Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
 
 
-
 make USEBUILDDIR=1 BACKEND=cuda
+
 make USEBUILDDIR=1 BACKEND=cppnone
+
 make USEBUILDDIR=1 BACKEND=cppsse4
+make USEBUILDDIR=1 BACKEND=cppavx2
 
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make USEBUILDDIR=1 BACKEND=cppavx2
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-
-make USEBUILDDIR=1 BACKEND=cpp512y
 
 make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-28_21:31:59
+DATE: 2024-08-08_20:58:38
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0
  [UNWEIGHT] Wrote 404 events (found 1817 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4662s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3929s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0732s for     8192 events => throughput is 1.12E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4756s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.4044s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0711s for     8192 events => throughput is 1.15E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0
  [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4147s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3397s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0750s for     8192 events => throughput is 1.09E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4108s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3420s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0688s for     8192 events => throughput is 1.19E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0
  [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL          :    2.3268s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.5225s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.8043s for    90112 events => throughput is 1.12E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.3245s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5525s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.7719s for    90112 events => throughput is 1.17E+05 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110463158198617] fbridge_mode=1
  [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4778s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4049s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0729s for     8192 events => throughput is 1.12E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4137s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3419s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0712s for     8192 events => throughput is 1.15E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2151 [0.21510686347932190] fbridge_mode=1
  [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL          :    2.4140s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6089s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.8051s for    90112 events => throughput is 1.12E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.3233s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5375s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.7851s for    90112 events => throughput is 1.15E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0007s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.138677e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.154270e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.134442e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.117776e+05                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110459183868807] fbridge_mode=1
  [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3860s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3594s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0266s for     8192 events => throughput is 3.08E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3703s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3439s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0260s for     8192 events => throughput is 3.15E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2151 [0.21510683073685827] fbridge_mode=1
  [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL          :    1.8423s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.5500s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2923s for    90112 events => throughput is 3.08E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.8197s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5348s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2844s for    90112 events => throughput is 3.17E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.036160e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.998738e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.064103e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.994620e+05                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110460727141733] fbridge_mode=1
  [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3719s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3585s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0133s for     8192 events => throughput is 6.14E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3581s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3447s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0130s for     8192 events => throughput is 6.29E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2151 [0.21510682516942223] fbridge_mode=1
  [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL          :    1.6766s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.5315s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1451s for    90112 events => throughput is 6.21E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.6873s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5442s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1427s for    90112 events => throughput is 6.31E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.245191e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.110364e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.259060e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.231132e+05                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110460727141733] fbridge_mode=1
  [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4340s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4218s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0122s for     8192 events => throughput is 6.70E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3551s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3423s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0124s for     8192 events => throughput is 6.61E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2151 [0.21510682516942223] fbridge_mode=1
  [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL          :    1.6834s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.5468s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1366s for    90112 events => throughput is 6.60E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.6706s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5390s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1312s for    90112 events => throughput is 6.87E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.674563e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.737889e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.747510e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.863785e+05                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -429,38 +437,179 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
 Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 16/32
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 1
+ [XSECTION] Cross section = 0.2711 [0.27110464220032526] fbridge_mode=1
+ [UNWEIGHT] Wrote 404 events (found 1228 events)
+ [COUNTERS] PROGRAM TOTAL          :    0.3592s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3420s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0167s for     8192 events => throughput is 4.91E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
+
+*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (0.27110539351263330) and cpp (0.27110464220032526) differ by less than 4E-4 (2.771292368253242e-06)
+
+*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+
+OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
+
+*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
+--------------------
+CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
+CUDACPP_RUNTIME_VECSIZEUSED = 8192
+--------------------
+81920 1 1 ! Number of events and max and min iterations
+0.000001 ! Accuracy (ignored because max iterations = min iterations)
+0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
+1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
+0 ! Helicity Sum/event 0=exact
+1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
+--------------------
+Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 16/32
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 1
+ [XSECTION] Cross section = 0.2151 [0.21510685471570221] fbridge_mode=1
+ [UNWEIGHT] Wrote 1939 events (found 1944 events)
+ [COUNTERS] PROGRAM TOTAL          :    1.7199s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5400s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1795s for    90112 events => throughput is 5.02E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
+
+*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (0.21510686556561295) and cpp (0.21510685471570221) differ by less than 4E-4 (5.043963013928732e-08)
+
+*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+
+OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
+
+*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
+Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.872478e+05                 )  sec^-1
+
+*** EXECUTE CHECK(8192) -p 256 32 1 ***
+Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.938459e+05                 )  sec^-1
+
+*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
+--------------------
+CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
+CUDACPP_RUNTIME_VECSIZEUSED = 8192
+--------------------
+8192 1 1 ! Number of events and max and min iterations
+0.000001 ! Accuracy (ignored because max iterations = min iterations)
+0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
+1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
+0 ! Helicity Sum/event 0=exact
+1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
+--------------------
+Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 16/32
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 1
+ [XSECTION] Cross section = 0.2711 [0.27110477321990667] fbridge_mode=1
+ [UNWEIGHT] Wrote 404 events (found 1228 events)
+ [COUNTERS] PROGRAM TOTAL          :    0.7679s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7663s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0006s for     8192 events => throughput is 1.31E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0009s
+
+*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (0.27110539351263330) and cuda (0.27110477321990667) differ by less than 4E-4 (2.2880132283242816e-06)
+
+*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+
+OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical
+
+*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) ***
+--------------------
+CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
+CUDACPP_RUNTIME_VECSIZEUSED = 8192
+--------------------
+81920 1 1 ! Number of events and max and min iterations
+0.000001 ! Accuracy (ignored because max iterations = min iterations)
+0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
+1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
+0 ! Helicity Sum/event 0=exact
+1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
+--------------------
+Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 16/32
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 1
+ [XSECTION] Cross section = 0.2151 [0.21510689318513457] fbridge_mode=1
+ [UNWEIGHT] Wrote 1939 events (found 1944 events)
+ [COUNTERS] PROGRAM TOTAL          :    1.9690s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.9617s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0063s for    90112 events => throughput is 1.43E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0009s
+
+*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (0.21510686556561295) and cuda (0.21510689318513457) differ by less than 4E-4 (1.2839907048700638e-07)
+
+*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+
+OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
+
+*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
+Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.567743e+07                 )  sec^-1
+
+*** EXECUTE GCHECK(8192) -p 256 32 1 ***
+Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 5.424411e+07                 )  sec^-1
+
+*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
+Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 5.006580e+07                 )  sec^-1
+
+*** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
+Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.460162e+08                 )  sec^-1
+
+*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
+Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 5.113271e+07                 )  sec^-1
+
+*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
+Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.506902e+08                 )  sec^-1
+
+*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
+Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.545880e+07                 )  sec^-1
+
+*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
+Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.393633e+07                 )  sec^-1
+
+*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
-Program received signal SIGFPE: Floating-point exception - erroneous arithmetic operation.
-
-Backtrace for this error:
-#0  0x7f2ce7e23860 in ???
-#1  0x7f2ce7e22a05 in ???
-#2  0x7f2ce7a54def in ???
-#3  0x7f2ce84b810a in ???
-#4  0x7f2ce80f2575 in ???
-#5  0x7f2ce84b4c89 in ???
-#6  0x7f2ce84bebfd in ???
-#7  0x7f2ce84c4491 in ???
-#8  0x4300eb in ???
-#9  0x431c70 in ???
-#10  0x432da7 in ???
-#11  0x433b7e in ???
-#12  0x44a9c1 in ???
-#13  0x42ebdf in ???
-#14  0x40371e in ???
-#15  0x7f2ce7a3feaf in ???
-#16  0x7f2ce7a3ff5f in ???
-#17  0x403844 in ???
-#18  0xffffffffffffffff in ???
-./madX.sh: line 389: 827445 Floating point exception(core dumped) $timecmd $cmd < ${tmpin} > ${tmp}
-ERROR! ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' failed
- PDF set = nn23lo1
- alpha_s(Mz)= 0.1300 running at 2 loops.
- alpha_s(Mz)= 0.1300 running at 2 loops.
- Renormalization scale set on event-by-event basis
- Factorization   scale set on event-by-event basis
-
-
- getting user params
-Enter number of events and max and min iterations: 
- Number of events and iterations         8192           1           1
+TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
index ce6f992dd2..25661e1063 100644
--- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
@@ -2,21 +2,21 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g
 
 
 make USEBUILDDIR=1 BACKEND=cuda
-
 make USEBUILDDIR=1 BACKEND=cppnone
 
+
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make USEBUILDDIR=1 BACKEND=cppsse4
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 
-make USEBUILDDIR=1 BACKEND=cpp512y
-
 make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-28_21:32:16
+DATE: 2024-08-08_20:59:06
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0
  [UNWEIGHT] Wrote 404 events (found 1817 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4634s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3913s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0721s for     8192 events => throughput is 1.14E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4768s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.4060s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0709s for     8192 events => throughput is 1.16E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0
  [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4051s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3318s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0733s for     8192 events => throughput is 1.12E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4179s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3473s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0706s for     8192 events => throughput is 1.16E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0
  [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL          :    2.3247s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.5253s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.7994s for    90112 events => throughput is 1.13E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.3258s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5517s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.7741s for    90112 events => throughput is 1.16E+05 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110539350666329] fbridge_mode=1
  [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4866s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4090s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0776s for     8192 events => throughput is 1.06E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4207s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3437s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0763s for     8192 events => throughput is 1.07E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2151 [0.21510686560103207] fbridge_mode=1
  [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL          :    2.4492s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6001s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.8491s for    90112 events => throughput is 1.06E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.3663s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5373s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.8282s for    90112 events => throughput is 1.09E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0007s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.073902e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.091070e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.081313e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.097593e+05                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110539350666335] fbridge_mode=1
  [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4169s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3745s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0424s for     8192 events => throughput is 1.93E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3890s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3472s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0412s for     8192 events => throughput is 1.99E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2151 [0.21510686560103204] fbridge_mode=1
  [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL          :    2.0520s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.5829s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4690s for    90112 events => throughput is 1.92E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.9944s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5398s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4540s for    90112 events => throughput is 1.98E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.888912e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.922053e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.907750e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.990970e+05                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110539330887440] fbridge_mode=1
  [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3963s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3710s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0253s for     8192 events => throughput is 3.24E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3734s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3492s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0237s for     8192 events => throughput is 3.46E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2151 [0.21510686557693198] fbridge_mode=1
  [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL          :    1.8273s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.5566s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2707s for    90112 events => throughput is 3.33E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.8003s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5375s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2622s for    90112 events => throughput is 3.44E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.409995e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.424784e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.381232e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.455227e+05                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110539330887440] fbridge_mode=1
  [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3784s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3565s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0219s for     8192 events => throughput is 3.75E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3680s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3463s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0211s for     8192 events => throughput is 3.88E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2151 [0.21510686557693198] fbridge_mode=1
  [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL          :    1.7902s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.5474s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2428s for    90112 events => throughput is 3.71E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7822s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5448s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2369s for    90112 events => throughput is 3.80E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.780633e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.843024e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.863761e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.890496e+05                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -429,7 +437,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
 Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 16/32
  [XSECTION] VECSIZE_USED = 8192
@@ -438,9 +445,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110539330887440] fbridge_mode=1
  [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4046s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3701s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0345s for     8192 events => throughput is 2.38E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3872s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3503s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0362s for     8192 events => throughput is 2.26E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0007s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -471,9 +479,10 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2151 [0.21510686557693198] fbridge_mode=1
  [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL          :    1.9365s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.5577s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.3788s for    90112 events => throughput is 2.38E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.9147s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.5452s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.3689s for    90112 events => throughput is 2.44E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -486,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.345816e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.300565e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.395812e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.415614e+05                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -514,9 +523,10 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2711 [0.27110539343558537] fbridge_mode=1
  [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7702s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7695s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0007s for     8192 events => throughput is 1.17E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.7684s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7665s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0008s for     8192 events => throughput is 1.09E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0012s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -547,9 +557,10 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2151 [0.21510686553631395] fbridge_mode=1
  [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL          :    2.0084s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.0002s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0082s for    90112 events => throughput is 1.10E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.9688s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.9599s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0078s for    90112 events => throughput is 1.15E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0012s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -562,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.433145e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.565914e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.010261e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.104681e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.473761e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.636309e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.584931e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.555697e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.303866e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.642280e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.831028e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.824016e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.325113e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.612307e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.788376e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.778614e+07                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt
index 46804abf09..9204db3db0 100644
--- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt
@@ -2,21 +2,21 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/h
 
 make USEBUILDDIR=1 BACKEND=cuda
 
-make USEBUILDDIR=1 BACKEND=cppnone
 
+make USEBUILDDIR=1 BACKEND=cppnone
 
 make USEBUILDDIR=1 BACKEND=cppsse4
-make USEBUILDDIR=1 BACKEND=cppavx2
 
+make USEBUILDDIR=1 BACKEND=cppavx2
 make USEBUILDDIR=1 BACKEND=cpp512y
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 
 make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-29_01:37:41
+DATE: 2024-08-09_00:48:38
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0
  [UNWEIGHT] Wrote 3321 events (found 6423 events)
- [COUNTERS] PROGRAM TOTAL          :    0.8908s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.8436s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0472s for     8192 events => throughput is 1.73E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.9141s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.8671s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0470s for     8192 events => throughput is 1.74E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0
  [UNWEIGHT] Wrote 1617 events (found 1622 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4057s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3597s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0460s for     8192 events => throughput is 1.78E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4185s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3716s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0468s for     8192 events => throughput is 1.75E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x10_fortran > /tmp
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.043 [2.0434895240377569] fbridge_mode=0
  [UNWEIGHT] Wrote 1818 events (found 1823 events)
- [COUNTERS] PROGRAM TOTAL          :    1.7090s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.1999s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.5092s for    90112 events => throughput is 1.77E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7982s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2863s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.5119s for    90112 events => throughput is 1.76E+05 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.016 [2.0162955499256148] fbridge_mode=1
  [UNWEIGHT] Wrote 1617 events (found 1622 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4491s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4002s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0489s for     8192 events => throughput is 1.68E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4199s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3695s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0500s for     8192 events => throughput is 1.64E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.043 [2.0434895240377564] fbridge_mode=1
  [UNWEIGHT] Wrote 1818 events (found 1823 events)
- [COUNTERS] PROGRAM TOTAL          :    1.8072s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2708s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.5364s for    90112 events => throughput is 1.68E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.8165s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2690s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.5470s for    90112 events => throughput is 1.65E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.706986e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.683813e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.699867e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.668738e+05                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.016 [2.0162955499256152] fbridge_mode=1
  [UNWEIGHT] Wrote 1617 events (found 1622 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4065s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3803s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0263s for     8192 events => throughput is 3.12E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4071s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3797s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0270s for     8192 events => throughput is 3.03E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.043 [2.0434895240377564] fbridge_mode=1
  [UNWEIGHT] Wrote 1818 events (found 1823 events)
- [COUNTERS] PROGRAM TOTAL          :    1.5505s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2580s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2925s for    90112 events => throughput is 3.08E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.5672s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2711s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2957s for    90112 events => throughput is 3.05E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.034167e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.037815e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.993777e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.993910e+05                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.016 [2.0162955499256232] fbridge_mode=1
  [UNWEIGHT] Wrote 1617 events (found 1622 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3843s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3683s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0160s for     8192 events => throughput is 5.11E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3883s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3715s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0164s for     8192 events => throughput is 5.00E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.043 [2.0434895240377489] fbridge_mode=1
  [UNWEIGHT] Wrote 1818 events (found 1823 events)
- [COUNTERS] PROGRAM TOTAL          :    1.4149s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2377s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1772s for    90112 events => throughput is 5.09E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4641s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2801s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1835s for    90112 events => throughput is 4.91E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.046299e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.902798e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.149674e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.886099e+05                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.016 [2.0162955499256232] fbridge_mode=1
  [UNWEIGHT] Wrote 1617 events (found 1622 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3859s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3712s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0147s for     8192 events => throughput is 5.58E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3876s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3719s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0152s for     8192 events => throughput is 5.38E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.043 [2.0434895240377489] fbridge_mode=1
  [UNWEIGHT] Wrote 1818 events (found 1823 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3953s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2347s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1605s for    90112 events => throughput is 5.61E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4216s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2567s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1645s for    90112 events => throughput is 5.48E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.314068e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.361206e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.459027e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.494947e+05                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.016 [2.0162955499256152] fbridge_mode=1
  [UNWEIGHT] Wrote 1617 events (found 1622 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4002s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3784s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0218s for     8192 events => throughput is 3.76E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3960s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3733s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0222s for     8192 events => throughput is 3.68E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.043 [2.0434895240377560] fbridge_mode=1
  [UNWEIGHT] Wrote 1818 events (found 1823 events)
- [COUNTERS] PROGRAM TOTAL          :    1.5943s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.3316s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2627s for    90112 events => throughput is 3.43E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.5023s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2627s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2391s for    90112 events => throughput is 3.77E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.445922e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.615246e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.494379e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.662708e+05                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggb
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.016 [2.0162955499256165] fbridge_mode=1
  [UNWEIGHT] Wrote 1617 events (found 1622 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7849s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7843s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0006s for     8192 events => throughput is 1.40E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.7949s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7934s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0007s for     8192 events => throughput is 1.20E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggb
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.043 [2.0434895240377573] fbridge_mode=1
  [UNWEIGHT] Wrote 1818 events (found 1823 events)
- [COUNTERS] PROGRAM TOTAL          :    2.2160s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.2094s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0066s for    90112 events => throughput is 1.36E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7013s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.6935s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0069s for    90112 events => throughput is 1.30E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.826997e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.844829e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.347084e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.285195e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.841221e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.255268e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.717455e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.760215e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.827200e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.235451e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.038168e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.038893e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.844247e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.241445e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.744588e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.725782e+07                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt
index bbce3b7240..ae36851550 100644
--- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt
@@ -1,10 +1,10 @@
 Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
 
-make USEBUILDDIR=1 BACKEND=cuda
-
 
+make USEBUILDDIR=1 BACKEND=cuda
 
 make USEBUILDDIR=1 BACKEND=cppnone
+
 make USEBUILDDIR=1 BACKEND=cppsse4
 make USEBUILDDIR=1 BACKEND=cppavx2
 
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-29_01:38:08
+DATE: 2024-08-09_00:49:04
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0
  [UNWEIGHT] Wrote 3321 events (found 6423 events)
- [COUNTERS] PROGRAM TOTAL          :    0.9065s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.8605s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0460s for     8192 events => throughput is 1.78E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.9394s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.8922s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0473s for     8192 events => throughput is 1.73E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0
  [UNWEIGHT] Wrote 1617 events (found 1622 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3960s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3500s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0460s for     8192 events => throughput is 1.78E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4203s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3728s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0475s for     8192 events => throughput is 1.72E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x10_fortran > /tmp
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.043 [2.0434895240377569] fbridge_mode=0
  [UNWEIGHT] Wrote 1818 events (found 1823 events)
- [COUNTERS] PROGRAM TOTAL          :    1.7057s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.1971s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.5086s for    90112 events => throughput is 1.77E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7988s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2854s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.5133s for    90112 events => throughput is 1.76E+05 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.016 [2.0162897355760356] fbridge_mode=1
  [UNWEIGHT] Wrote 1620 events (found 1625 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4481s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4028s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0453s for     8192 events => throughput is 1.81E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4180s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3713s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0463s for     8192 events => throughput is 1.77E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt
index 2d49c9f52b..d90f539fcf 100644
--- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt
@@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 
 make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-29_01:38:14
+DATE: 2024-08-09_00:49:10
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0
  [UNWEIGHT] Wrote 3321 events (found 6423 events)
- [COUNTERS] PROGRAM TOTAL          :    0.8895s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.8431s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0464s for     8192 events => throughput is 1.77E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.9158s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.8684s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0474s for     8192 events => throughput is 1.73E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0
  [UNWEIGHT] Wrote 1617 events (found 1622 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4135s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3638s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0497s for     8192 events => throughput is 1.65E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4209s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3739s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0470s for     8192 events => throughput is 1.74E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x10_fortran > /tmp
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.043 [2.0434895240377569] fbridge_mode=0
  [UNWEIGHT] Wrote 1818 events (found 1823 events)
- [COUNTERS] PROGRAM TOTAL          :    1.7047s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.1950s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.5097s for    90112 events => throughput is 1.77E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.8008s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2889s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.5118s for    90112 events => throughput is 1.76E+05 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -134,9 +134,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.016 [2.0162955975930954] fbridge_mode=1
  [UNWEIGHT] Wrote 1617 events (found 1622 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4487s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4005s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0482s for     8192 events => throughput is 1.70E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4229s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3736s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0488s for     8192 events => throughput is 1.68E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -168,9 +169,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.043 [2.0434895706383660] fbridge_mode=1
  [UNWEIGHT] Wrote 1818 events (found 1823 events)
- [COUNTERS] PROGRAM TOTAL          :    1.8098s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2747s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.5351s for    90112 events => throughput is 1.68E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.8077s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2621s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.5452s for    90112 events => throughput is 1.65E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -184,13 +186,13 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.605995e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.584312e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.601314e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.572139e+05                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -214,9 +216,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.016 [2.0162955975930958] fbridge_mode=1
  [UNWEIGHT] Wrote 1617 events (found 1622 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4069s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3802s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0266s for     8192 events => throughput is 3.08E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4000s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3717s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0278s for     8192 events => throughput is 2.94E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -248,9 +251,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.043 [2.0434895706383669] fbridge_mode=1
  [UNWEIGHT] Wrote 1818 events (found 1823 events)
- [COUNTERS] PROGRAM TOTAL          :    1.5495s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2534s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2962s for    90112 events => throughput is 3.04E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.6068s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3000s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.3063s for    90112 events => throughput is 2.94E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -264,13 +268,13 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.929578e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.801476e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.898682e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.739519e+05                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -294,9 +298,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.016 [2.0162955953696393] fbridge_mode=1
  [UNWEIGHT] Wrote 1617 events (found 1622 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3820s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3660s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0160s for     8192 events => throughput is 5.11E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4107s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3912s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0191s for     8192 events => throughput is 4.29E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -328,9 +333,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.043 [2.0434895701245432] fbridge_mode=1
  [UNWEIGHT] Wrote 1818 events (found 1823 events)
- [COUNTERS] PROGRAM TOTAL          :    1.4322s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2507s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1815s for    90112 events => throughput is 4.96E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4541s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2695s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1842s for    90112 events => throughput is 4.89E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -344,13 +350,13 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.830829e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.846731e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.788599e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.806331e+05                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -374,9 +380,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.016 [2.0162955953696393] fbridge_mode=1
  [UNWEIGHT] Wrote 1617 events (found 1622 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3978s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3830s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0147s for     8192 events => throughput is 5.56E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3903s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3744s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0155s for     8192 events => throughput is 5.29E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -408,9 +415,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.043 [2.0434895701245432] fbridge_mode=1
  [UNWEIGHT] Wrote 1818 events (found 1823 events)
- [COUNTERS] PROGRAM TOTAL          :    1.4028s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2374s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1654s for    90112 events => throughput is 5.45E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4306s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2629s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1673s for    90112 events => throughput is 5.39E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -424,13 +432,13 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.250961e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.198253e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.207264e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.334338e+05                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -454,9 +462,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.016 [2.0162955953691082] fbridge_mode=1
  [UNWEIGHT] Wrote 1617 events (found 1622 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4003s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3781s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0222s for     8192 events => throughput is 3.69E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4086s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3841s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0240s for     8192 events => throughput is 3.41E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -488,9 +497,10 @@ INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.043 [2.0434895701243878] fbridge_mode=1
  [UNWEIGHT] Wrote 1818 events (found 1823 events)
- [COUNTERS] PROGRAM TOTAL          :    1.4957s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2503s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2454s for    90112 events => throughput is 3.67E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.5232s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2714s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2514s for    90112 events => throughput is 3.58E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -504,13 +514,13 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.177035e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.375382e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.405731e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.300552e+05                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -533,9 +543,10 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggb
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.016 [2.0162955503257827] fbridge_mode=1
  [UNWEIGHT] Wrote 1617 events (found 1622 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7815s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7809s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0006s for     8192 events => throughput is 1.39E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.7989s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7974s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0007s for     8192 events => throughput is 1.20E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -566,9 +577,10 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggb
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 2.043 [2.0434895242795732] fbridge_mode=1
  [UNWEIGHT] Wrote 1818 events (found 1823 events)
- [COUNTERS] PROGRAM TOTAL          :    1.6630s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6565s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0065s for    90112 events => throughput is 1.38E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.6979s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.6904s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0066s for    90112 events => throughput is 1.36E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -581,42 +593,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.830803e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.835154e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.306249e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.144694e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.830251e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.230105e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.737584e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.705062e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.843710e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.235322e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.038788e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.035545e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.827828e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.242431e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.738262e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.754474e+07                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt
index cacd0f35d9..5562e4c07e 100644
--- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt
@@ -3,9 +3,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/s
 make USEBUILDDIR=1 BACKEND=cuda
 
 
-make USEBUILDDIR=1 BACKEND=cppsse4
-
 make USEBUILDDIR=1 BACKEND=cppnone
+
+make USEBUILDDIR=1 BACKEND=cppsse4
 make USEBUILDDIR=1 BACKEND=cppavx2
 
 make USEBUILDDIR=1 BACKEND=cpp512y
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-29_01:40:12
+DATE: 2024-08-09_00:52:08
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0
  [UNWEIGHT] Wrote 1 events (found 1041 events)
- [COUNTERS] PROGRAM TOTAL          :    2.6435s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3260s
- [COUNTERS] Fortran MEs      ( 1 ) :    2.3175s for     8192 events => throughput is 3.53E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.5941s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3442s
+ [COUNTERS] Fortran MEs      ( 1 ) :    2.2499s for     8192 events => throughput is 3.64E+03 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    2.5471s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3268s
- [COUNTERS] Fortran MEs      ( 1 ) :    2.2203s for     8192 events => throughput is 3.69E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.6220s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3462s
+ [COUNTERS] Fortran MEs      ( 1 ) :    2.2759s for     8192 events => throughput is 3.60E+03 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x10_fortran > /
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668083551438230E-007] fbridge_mode=0
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :   26.2382s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.7627s
- [COUNTERS] Fortran MEs      ( 1 ) :   24.4756s for    90112 events => throughput is 3.68E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   26.7017s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.8086s
+ [COUNTERS] Fortran MEs      ( 1 ) :   24.8931s for    90112 events => throughput is 3.62E+03 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    5.0170s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.6185s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    2.3985s for     8192 events => throughput is 3.42E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.7821s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3463s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    2.4305s for     8192 events => throughput is 3.37E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0052s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668083551438187E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :   30.4436s
- [COUNTERS] Fortran Overhead ( 0 ) :    4.0366s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   26.4070s for    90112 events => throughput is 3.41E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   28.5017s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7808s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   26.7158s for    90112 events => throughput is 3.37E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0052s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.563022e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.542884e+03                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.568130e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.530103e+03                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896697955084412E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    2.8190s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.5560s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    1.2630s for     8192 events => throughput is 6.49E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.6103s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3441s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    1.2634s for     8192 events => throughput is 6.48E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0028s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668083551438230E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :   16.9508s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.9389s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   14.0119s for    90112 events => throughput is 6.43E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   15.9197s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7936s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   14.1234s for    90112 events => throughput is 6.38E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0027s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.403640e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.656588e+03                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.805258e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.664988e+03                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    1.4145s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.8628s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.5516s for     8192 events => throughput is 1.49E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.9116s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3446s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.5653s for     8192 events => throughput is 1.45E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0017s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668083551438198E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :    8.4794s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.2982s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    6.1812s for    90112 events => throughput is 1.46E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    8.0033s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7755s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    6.2261s for    90112 events => throughput is 1.45E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0017s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.512239e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.485686e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.514846e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.488153e+04                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3031s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.8025s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.5006s for     8192 events => throughput is 1.64E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8483s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3476s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4991s for     8192 events => throughput is 1.64E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0016s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668083551438198E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :    7.6774s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.2371s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    5.4403s for    90112 events => throughput is 1.66E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    7.2914s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7820s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    5.5079s for    90112 events => throughput is 1.64E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0015s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.729653e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.693554e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.736018e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.678028e+04                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    1.5822s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.9519s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.6302s for     8192 events => throughput is 1.30E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.9859s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3430s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.6411s for     8192 events => throughput is 1.28E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0018s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668083551438198E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :    9.3263s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.3416s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    6.9846s for    90112 events => throughput is 1.29E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    8.8930s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7934s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    7.0976s for    90112 events => throughput is 1.27E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0021s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.291281e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.269596e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.310298e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.304260e+04                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftgg
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    0.8138s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7967s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0171s for     8192 events => throughput is 4.79E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8106s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7739s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0172s for     8192 events => throughput is 4.76E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0196s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftgg
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668083551438198E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :    2.3691s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.1805s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1886s for    90112 events => throughput is 4.78E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.4031s
+ [COUNTERS] Fortran Overhead ( 0 ) :    2.1951s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1884s for    90112 events => throughput is 4.78E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0195s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.833128e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.836004e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.235176e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.223426e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.111711e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.196129e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.408923e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.417377e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.159800e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.149870e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.418265e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.416796e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.120081e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.156718e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.756468e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.752894e+05                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt
index 92432a70ab..e6a1cba79b 100644
--- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt
@@ -1,10 +1,10 @@
 Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
 
-
 make USEBUILDDIR=1 BACKEND=cuda
 
-make USEBUILDDIR=1 BACKEND=cppnone
 
+
+make USEBUILDDIR=1 BACKEND=cppnone
 make USEBUILDDIR=1 BACKEND=cppsse4
 make USEBUILDDIR=1 BACKEND=cppavx2
 
@@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 
 make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-29_01:42:44
+DATE: 2024-08-09_00:54:32
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0
  [UNWEIGHT] Wrote 1 events (found 1041 events)
- [COUNTERS] PROGRAM TOTAL          :    2.5425s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3274s
- [COUNTERS] Fortran MEs      ( 1 ) :    2.2151s for     8192 events => throughput is 3.70E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.6010s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3425s
+ [COUNTERS] Fortran MEs      ( 1 ) :    2.2584s for     8192 events => throughput is 3.63E+03 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    2.5437s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3275s
- [COUNTERS] Fortran MEs      ( 1 ) :    2.2162s for     8192 events => throughput is 3.70E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.6135s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3438s
+ [COUNTERS] Fortran MEs      ( 1 ) :    2.2696s for     8192 events => throughput is 3.61E+03 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x10_fortran > /
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668083551438230E-007] fbridge_mode=0
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :   26.3201s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.7716s
- [COUNTERS] Fortran MEs      ( 1 ) :   24.5485s for    90112 events => throughput is 3.67E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   26.5878s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7922s
+ [COUNTERS] Fortran MEs      ( 1 ) :   24.7956s for    90112 events => throughput is 3.63E+03 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896784952157763E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    4.9557s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.5931s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    2.3626s for     8192 events => throughput is 3.47E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.7487s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3437s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    2.4000s for     8192 events => throughput is 3.41E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0050s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668138450782073E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :   29.8768s
- [COUNTERS] Fortran Overhead ( 0 ) :    3.9900s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   25.8868s for    90112 events => throughput is 3.48E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   28.1446s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7932s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   26.3466s for    90112 events => throughput is 3.42E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0048s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.615815e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.577022e+03                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.588760e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.590866e+03                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896766542858863E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    1.7362s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.0215s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.7147s for     8192 events => throughput is 1.15E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.0076s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3437s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.6623s for     8192 events => throughput is 1.24E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0016s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668121906848987E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :    9.5915s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.3718s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    7.2197s for    90112 events => throughput is 1.25E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    9.0575s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7825s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    7.2734s for    90112 events => throughput is 1.24E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0016s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.287471e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.265218e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.287472e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.265996e+04                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896764408326359E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    0.8858s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.6041s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2817s for     8192 events => throughput is 2.91E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.6296s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3461s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2826s for     8192 events => throughput is 2.90E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0010s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668124799901306E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :    5.1146s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.9843s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    3.1303s for    90112 events => throughput is 2.88E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    4.9000s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7718s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    3.1273s for    90112 events => throughput is 2.88E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0010s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.952012e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.939784e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.978134e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.964350e+04                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896764408326359E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    0.8292s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.5726s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2566s for     8192 events => throughput is 3.19E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.6110s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3506s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2595s for     8192 events => throughput is 3.16E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0009s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668124799901306E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :    4.7825s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.9553s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    2.8272s for    90112 events => throughput is 3.19E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    4.6623s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7820s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    2.8794s for    90112 events => throughput is 3.13E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0009s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.330203e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.263231e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.291014e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.247254e+04                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896778056937195E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    0.9527s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.6385s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.3142s for     8192 events => throughput is 2.61E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.6684s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3460s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.3212s for     8192 events => throughput is 2.55E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0013s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668139178203571E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :    5.5476s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.0509s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    3.4967s for    90112 events => throughput is 2.58E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    5.3279s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7717s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    3.5549s for    90112 events => throughput is 2.53E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0013s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.605632e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.589261e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.608873e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.602723e+04                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftgg
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896802503195373E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    0.8048s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7896s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0152s for     8192 events => throughput is 5.39E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8100s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7757s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0172s for     8192 events => throughput is 4.77E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0171s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftgg
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668190930428073E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :    2.3385s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.1704s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1681s for    90112 events => throughput is 5.36E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.3814s
+ [COUNTERS] Fortran Overhead ( 0 ) :    2.1945s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1700s for    90112 events => throughput is 5.30E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0169s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,37 +573,37 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.899244e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.860775e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.168943e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.139558e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.330723e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.304686e+06                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.347926e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.344126e+06                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.329833e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.335964e+06                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.345608e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.345203e+06                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.313833e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.314317e+06                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt
index 7abb61d6c6..7e343e91b1 100644
--- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt
@@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/s
 
 
 make USEBUILDDIR=1 BACKEND=cuda
-make USEBUILDDIR=1 BACKEND=cppnone
 
-make USEBUILDDIR=1 BACKEND=cppsse4
 
+make USEBUILDDIR=1 BACKEND=cppnone
+make USEBUILDDIR=1 BACKEND=cppsse4
 make USEBUILDDIR=1 BACKEND=cppavx2
 
 make USEBUILDDIR=1 BACKEND=cpp512y
@@ -13,8 +13,8 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-29_01:44:48
+DATE: 2024-08-09_00:56:30
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0
  [UNWEIGHT] Wrote 1 events (found 1041 events)
- [COUNTERS] PROGRAM TOTAL          :    2.5379s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3249s
- [COUNTERS] Fortran MEs      ( 1 ) :    2.2131s for     8192 events => throughput is 3.70E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.5870s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3434s
+ [COUNTERS] Fortran MEs      ( 1 ) :    2.2435s for     8192 events => throughput is 3.65E+03 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    2.5505s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3304s
- [COUNTERS] Fortran MEs      ( 1 ) :    2.2201s for     8192 events => throughput is 3.69E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.5935s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3429s
+ [COUNTERS] Fortran MEs      ( 1 ) :    2.2507s for     8192 events => throughput is 3.64E+03 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x10_fortran > /
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668083551438230E-007] fbridge_mode=0
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :   26.2622s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.7530s
- [COUNTERS] Fortran MEs      ( 1 ) :   24.5092s for    90112 events => throughput is 3.68E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   26.4482s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7926s
+ [COUNTERS] Fortran MEs      ( 1 ) :   24.6556s for    90112 events => throughput is 3.65E+03 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896696375074447E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    5.0598s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.6439s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    2.4159s for     8192 events => throughput is 3.39E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.7899s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3466s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    2.4385s for     8192 events => throughput is 3.36E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0049s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668081976882373E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :   30.6006s
- [COUNTERS] Fortran Overhead ( 0 ) :    4.0492s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   26.5514s for    90112 events => throughput is 3.39E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   28.6799s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7926s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   26.8820s for    90112 events => throughput is 3.35E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0052s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.474454e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.507267e+03                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.439517e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.511786e+03                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896696285825688E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    2.7397s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.5249s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    1.2148s for     8192 events => throughput is 6.74E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.5883s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3421s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    1.2436s for     8192 events => throughput is 6.59E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0027s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668081890954375E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :   16.3722s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.9270s
- [COUNTERS] CudaCpp MEs      ( 2 ) :   13.4452s for    90112 events => throughput is 6.70E+03 events/s
+ [COUNTERS] PROGRAM TOTAL          :   15.4498s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7701s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :   13.6770s for    90112 events => throughput is 6.59E+03 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0028s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.040827e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.943689e+03                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.981328e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.925887e+03                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896696427369838E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    1.4085s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.8620s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.5465s for     8192 events => throughput is 1.50E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.9098s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3504s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.5576s for     8192 events => throughput is 1.47E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0018s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668082030339872E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :    8.3202s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.2754s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    6.0449s for    90112 events => throughput is 1.49E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    7.9207s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7702s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    6.1490s for    90112 events => throughput is 1.47E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0016s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.536921e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.518105e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.529532e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.514088e+04                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896696427369838E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    1.2826s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7966s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4860s for     8192 events => throughput is 1.69E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8334s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3445s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4873s for     8192 events => throughput is 1.68E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0016s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668082030339872E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :    7.5114s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.1898s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    5.3216s for    90112 events => throughput is 1.69E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    7.1725s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7642s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    5.4067s for    90112 events => throughput is 1.67E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0015s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.741888e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.710218e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.760590e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.722202e+04                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896696427369838E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    1.5988s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.9602s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.6386s for     8192 events => throughput is 1.28E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.9928s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3430s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.6479s for     8192 events => throughput is 1.26E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0019s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668082030339872E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :    9.4068s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.3459s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    7.0609s for    90112 events => throughput is 1.28E+04 events/s
+ [COUNTERS] PROGRAM TOTAL          :    9.0659s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7892s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    7.2749s for    90112 events => throughput is 1.24E+04 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0018s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.302056e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.210214e+04                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.302590e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.254889e+04                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftgg
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.99e-07 [7.9896697918297644E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 215 events (found 963 events)
- [COUNTERS] PROGRAM TOTAL          :    0.8079s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7907s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0172s for     8192 events => throughput is 4.77E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8127s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7760s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0173s for     8192 events => throughput is 4.75E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0195s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftgg
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 7.667e-07 [7.6668083551547592E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 1700 events (found 1705 events)
- [COUNTERS] PROGRAM TOTAL          :    2.5070s
- [COUNTERS] Fortran Overhead ( 0 ) :    2.3171s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1899s for    90112 events => throughput is 4.75E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    2.4045s
+ [COUNTERS] Fortran Overhead ( 0 ) :    2.1952s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1896s for    90112 events => throughput is 4.75E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0197s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.814826e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.814747e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.215347e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.187533e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.080489e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.164029e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.380958e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.389995e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.077544e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.128645e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.380568e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.372948e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.123241e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.119403e+05                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.746541e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.750060e+05                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt
index 6e0ebf0fe6..0fe0851e40 100644
--- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt
@@ -1,8 +1,8 @@
 Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
 
-
 make USEBUILDDIR=1 BACKEND=cuda
 
+
 make USEBUILDDIR=1 BACKEND=cppnone
 
 make USEBUILDDIR=1 BACKEND=cppsse4
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-07-23_17:06:30
+DATE: 2024-08-09_00:50:54
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0
  [UNWEIGHT] Wrote 1767 events (found 4306 events)
- [COUNTERS] PROGRAM TOTAL          :    0.6347s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.6253s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0093s for     8192 events => throughput is 8.79E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.6580s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.6494s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0085s for     8192 events => throughput is 9.58E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3827s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3737s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0089s for     8192 events => throughput is 9.17E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3938s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3851s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0086s for     8192 events => throughput is 9.50E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x10_fortran > /t
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098556244384407] fbridge_mode=0
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3840s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2894s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0946s for    90112 events => throughput is 9.53E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4272s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3345s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0927s for    90112 events => throughput is 9.72E+05 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156027201869291] fbridge_mode=1
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3976s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3899s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0077s for     8192 events => throughput is 1.06E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3960s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3874s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0082s for     8192 events => throughput is 9.99E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098556244384418] fbridge_mode=1
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3567s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2681s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0886s for    90112 events => throughput is 1.02E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4271s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3353s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0914s for    90112 events => throughput is 9.86E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.005595e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.006217e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.012216e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.022578e+06                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156027201869291] fbridge_mode=1
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3752s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3708s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0044s for     8192 events => throughput is 1.84E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3903s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3856s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0043s for     8192 events => throughput is 1.89E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098556244384418] fbridge_mode=1
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3147s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2675s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0472s for    90112 events => throughput is 1.91E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.3937s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3444s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0489s for    90112 events => throughput is 1.84E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.909460e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.897485e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.971415e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.985824e+06                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156027201869291] fbridge_mode=1
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3705s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3679s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0026s for     8192 events => throughput is 3.18E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3921s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3888s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0028s for     8192 events => throughput is 2.88E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098556244384418] fbridge_mode=1
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.2978s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2679s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0299s for    90112 events => throughput is 3.01E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.3531s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3221s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0306s for    90112 events => throughput is 2.95E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.080442e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.126014e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.074278e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.364824e+06                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156027201869291] fbridge_mode=1
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3716s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3691s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0025s for     8192 events => throughput is 3.23E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3883s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3854s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0026s for     8192 events => throughput is 3.20E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098556244384418] fbridge_mode=1
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3003s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2717s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0286s for    90112 events => throughput is 3.15E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.3635s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3336s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0295s for    90112 events => throughput is 3.05E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.360999e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.285096e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.517113e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.423598e+06                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156027201869291] fbridge_mode=1
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3734s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3703s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0030s for     8192 events => throughput is 2.70E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3910s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3874s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0031s for     8192 events => throughput is 2.63E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098556244384418] fbridge_mode=1
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3066s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2761s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0304s for    90112 events => throughput is 2.96E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.3563s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3235s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0323s for    90112 events => throughput is 2.79E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.892371e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.866364e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.222820e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.134151e+06                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggt
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156027201869280] fbridge_mode=1
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.8041s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.8036s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0005s for     8192 events => throughput is 1.64E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8164s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.8152s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0006s for     8192 events => throughput is 1.37E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggt
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098556244384401] fbridge_mode=1
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.7080s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.7028s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0051s for    90112 events => throughput is 1.75E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7576s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7518s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0052s for    90112 events => throughput is 1.72E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.783715e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.730366e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.058188e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.967481e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.170829e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.198830e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.604074e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.649618e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.147861e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.170218e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.994548e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.903772e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.155905e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.201664e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.319425e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.319844e+08                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt
index 0993fdcc1c..5c4b04cd13 100644
--- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-07-23_17:06:54
+DATE: 2024-08-09_00:51:19
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0
  [UNWEIGHT] Wrote 1767 events (found 4306 events)
- [COUNTERS] PROGRAM TOTAL          :    0.6621s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.6530s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0091s for     8192 events => throughput is 9.01E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.6497s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.6414s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0083s for     8192 events => throughput is 9.86E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3966s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3872s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0094s for     8192 events => throughput is 8.74E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4039s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3951s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0089s for     8192 events => throughput is 9.25E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x10_fortran > /t
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098556244384407] fbridge_mode=0
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3966s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.3008s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0958s for    90112 events => throughput is 9.40E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4878s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3911s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0967s for    90112 events => throughput is 9.32E+05 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156021439979276] fbridge_mode=1
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3872s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3787s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0086s for     8192 events => throughput is 9.57E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3975s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3887s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0084s for     8192 events => throughput is 9.70E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098550550786874] fbridge_mode=1
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3681s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2789s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0892s for    90112 events => throughput is 1.01E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4264s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3345s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0916s for    90112 events => throughput is 9.84E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.015948e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.034265e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.021671e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.024334e+06                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156021343761686] fbridge_mode=1
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3766s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3741s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0025s for     8192 events => throughput is 3.22E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3905s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3875s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0026s for     8192 events => throughput is 3.09E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098550488814170] fbridge_mode=1
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3081s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2794s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0287s for    90112 events => throughput is 3.14E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.3711s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3420s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0289s for    90112 events => throughput is 3.12E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.304389e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.288372e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.380080e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.432097e+06                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156021516056748] fbridge_mode=1
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3710s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3693s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0017s for     8192 events => throughput is 4.90E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3889s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3868s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0018s for     8192 events => throughput is 4.52E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098550596898289] fbridge_mode=1
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.2810s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2620s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0191s for    90112 events => throughput is 4.72E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.3432s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3229s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0200s for    90112 events => throughput is 4.50E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.883752e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.077269e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.254694e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.403997e+06                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156021516056748] fbridge_mode=1
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3739s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3722s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0017s for     8192 events => throughput is 4.74E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3869s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3848s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0018s for     8192 events => throughput is 4.55E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098550596898289] fbridge_mode=1
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3493s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.3299s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0195s for    90112 events => throughput is 4.63E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.3387s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3197s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0187s for    90112 events => throughput is 4.81E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.263107e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.322495e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.596148e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.427973e+06                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156021917867366] fbridge_mode=1
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3804s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3783s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0020s for     8192 events => throughput is 4.02E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3878s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3853s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0022s for     8192 events => throughput is 3.78E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098551029624061] fbridge_mode=1
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.2838s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2628s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0210s for    90112 events => throughput is 4.30E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.3406s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3185s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0218s for    90112 events => throughput is 4.14E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.410618e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.424607e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.579934e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.888963e+06                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggt
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156022290359153] fbridge_mode=1
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.8021s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.8016s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0005s for     8192 events => throughput is 1.58E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8169s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.8154s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0006s for     8192 events => throughput is 1.46E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0009s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggt
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098551341908548] fbridge_mode=1
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.7042s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6994s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0047s for    90112 events => throughput is 1.90E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7464s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7407s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0049s for    90112 events => throughput is 1.85E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0009s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.004673e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.032627e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.352806e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.278657e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.848697e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.543019e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.574226e+09                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.578539e+09                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.876693e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.555176e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.655828e+09                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.658200e+09                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.468075e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.883073e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.699037e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.705532e+08                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt
index f51812e183..62624c2c92 100644
--- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt
@@ -1,9 +1,9 @@
 Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
 
 
-make USEBUILDDIR=1 BACKEND=cuda
 
 
+make USEBUILDDIR=1 BACKEND=cuda
 make USEBUILDDIR=1 BACKEND=cppnone
 make USEBUILDDIR=1 BACKEND=cppsse4
 make USEBUILDDIR=1 BACKEND=cppavx2
@@ -13,10 +13,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 
 make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-07-23_17:07:18
+DATE: 2024-08-09_00:51:44
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0
  [UNWEIGHT] Wrote 1767 events (found 4306 events)
- [COUNTERS] PROGRAM TOTAL          :    0.6364s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.6273s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0091s for     8192 events => throughput is 8.96E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.6493s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.6409s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0084s for     8192 events => throughput is 9.81E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156027201869302] fbridge_mode=0
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3980s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3884s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0096s for     8192 events => throughput is 8.50E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3992s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3909s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0083s for     8192 events => throughput is 9.85E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x10_fortran > /t
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098556244384407] fbridge_mode=0
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.4071s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.3113s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0958s for    90112 events => throughput is 9.41E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4133s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3208s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0925s for    90112 events => throughput is 9.75E+05 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156028014369008] fbridge_mode=1
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3870s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3789s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0082s for     8192 events => throughput is 1.00E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3950s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3864s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0082s for     8192 events => throughput is 9.94E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098557069460298] fbridge_mode=1
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3664s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2748s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0916s for    90112 events => throughput is 9.84E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4087s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3177s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0906s for    90112 events => throughput is 9.95E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.803105e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.803386e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.833012e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.910254e+05                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156028014369008] fbridge_mode=1
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3826s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3782s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0044s for     8192 events => throughput is 1.86E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3923s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3874s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0045s for     8192 events => throughput is 1.82E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098557069460298] fbridge_mode=1
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3196s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2730s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0466s for    90112 events => throughput is 1.93E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.3653s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3175s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0474s for    90112 events => throughput is 1.90E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.966382e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.964224e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.002002e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.028853e+06                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156028097537258] fbridge_mode=1
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3795s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3765s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0030s for     8192 events => throughput is 2.76E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3954s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3923s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0027s for     8192 events => throughput is 3.03E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098557141632605] fbridge_mode=1
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3017s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2730s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0287s for    90112 events => throughput is 3.14E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.3415s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3131s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0280s for    90112 events => throughput is 3.22E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.113986e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.237365e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.514629e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.416021e+06                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156028097537258] fbridge_mode=1
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3847s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3820s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0026s for     8192 events => throughput is 3.12E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3940s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3909s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0027s for     8192 events => throughput is 3.04E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098557141632605] fbridge_mode=1
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3019s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2739s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0280s for    90112 events => throughput is 3.22E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.3467s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3184s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0279s for    90112 events => throughput is 3.23E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.470466e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.347126e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.587854e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.589308e+06                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156028097537258] fbridge_mode=1
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3731s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3703s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0028s for     8192 events => throughput is 2.87E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3978s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3942s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0031s for     8192 events => throughput is 2.66E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098557141632605] fbridge_mode=1
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3054s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2744s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0310s for    90112 events => throughput is 2.91E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.3501s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.3186s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0311s for    90112 events => throughput is 2.90E+06 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.935388e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.904623e+06                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.012886e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.114835e+06                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggt
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.3116 [0.31156027194560187] fbridge_mode=1
  [UNWEIGHT] Wrote 1636 events (found 1641 events)
- [COUNTERS] PROGRAM TOTAL          :    0.8059s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.8054s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0005s for     8192 events => throughput is 1.56E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8152s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.8140s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0006s for     8192 events => throughput is 1.39E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggt
  [XSECTION] ChannelId = 3
  [XSECTION] Cross section = 0.311 [0.31098556243340819] fbridge_mode=1
  [UNWEIGHT] Wrote 1828 events (found 1833 events)
- [COUNTERS] PROGRAM TOTAL          :    1.7003s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6952s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0051s for    90112 events => throughput is 1.78E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7501s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7444s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0051s for    90112 events => throughput is 1.75E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0006s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.612145e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.842332e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.945612e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.019027e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.146137e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.214756e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.463844e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.517612e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.146854e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.171297e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.862724e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.740991e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.162271e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.214875e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.282910e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.310258e+08                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt
index 679246dd46..6131633fdd 100644
--- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx
 
-make USEBUILDDIR=1 BACKEND=cuda
 
+make USEBUILDDIR=1 BACKEND=cuda
 
 make USEBUILDDIR=1 BACKEND=cppnone
 make USEBUILDDIR=1 BACKEND=cppsse4
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-29_01:38:40
+DATE: 2024-08-09_00:49:37
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0
  [UNWEIGHT] Wrote 2620 events (found 5403 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7777s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7362s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0415s for     8192 events => throughput is 1.97E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8016s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7599s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0418s for     8192 events => throughput is 1.96E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3976s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3565s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0411s for     8192 events => throughput is 1.99E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4173s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3758s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0415s for     8192 events => throughput is 1.97E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x10_fortran > /tmp
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577523870256456] fbridge_mode=0
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.6909s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2365s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.4544s for    90112 events => throughput is 1.98E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.6984s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2478s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.4506s for    90112 events => throughput is 2.00E+05 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598860065419863] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4458s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4030s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0427s for     8192 events => throughput is 1.92E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4145s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3702s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0438s for     8192 events => throughput is 1.87E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577523870256471] fbridge_mode=1
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.7824s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.3031s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4793s for    90112 events => throughput is 1.88E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7366s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2536s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4825s for    90112 events => throughput is 1.87E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.924806e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.880754e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.912421e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.882930e+05                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4066s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3820s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0246s for     8192 events => throughput is 3.33E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3960s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3713s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0243s for     8192 events => throughput is 3.37E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577523870256471] fbridge_mode=1
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.5413s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2750s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2663s for    90112 events => throughput is 3.38E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.5199s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2483s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2711s for    90112 events => throughput is 3.32E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.314362e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.302363e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.440069e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.365112e+05                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3874s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3723s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0150s for     8192 events => throughput is 5.45E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3924s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3765s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0155s for     8192 events => throughput is 5.28E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577523870256485] fbridge_mode=1
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.4296s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2659s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1637s for    90112 events => throughput is 5.50E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4183s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2503s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1675s for    90112 events => throughput is 5.38E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.062177e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.278183e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.382563e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.374748e+05                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3976s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3833s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0144s for     8192 events => throughput is 5.70E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3894s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3754s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0136s for     8192 events => throughput is 6.02E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577523870256485] fbridge_mode=1
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.4167s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2662s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1505s for    90112 events => throughput is 5.99E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.3978s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2454s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1520s for    90112 events => throughput is 5.93E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.763603e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.775498e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.871042e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.841522e+05                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3996s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3785s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0211s for     8192 events => throughput is 3.89E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4047s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3821s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0222s for     8192 events => throughput is 3.70E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577523870256485] fbridge_mode=1
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.5325s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2948s
+ [COUNTERS] PROGRAM TOTAL          :    1.4927s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2545s
  [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2377s for    90112 events => throughput is 3.79E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.502625e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.798876e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.556287e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.612840e+05                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598860065419849] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.8405s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.8399s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0006s for     8192 events => throughput is 1.41E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8126s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.8111s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0007s for     8192 events => throughput is 1.24E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577523870256485] fbridge_mode=1
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.6883s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6820s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0063s for    90112 events => throughput is 1.42E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.6862s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.6788s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0066s for    90112 events => throughput is 1.37E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.919891e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.869432e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.630666e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.714086e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.869333e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.311155e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.082168e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.083882e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.898237e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.322734e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.164202e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.159310e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.905759e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.296675e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.090826e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.098537e+07                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt
index 9e00b5e78a..58b86df658 100644
--- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt
@@ -2,12 +2,12 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/s
 
 make USEBUILDDIR=1 BACKEND=cuda
 
-
-
 make USEBUILDDIR=1 BACKEND=cppnone
+
 make USEBUILDDIR=1 BACKEND=cppsse4
-make USEBUILDDIR=1 BACKEND=cppavx2
 
+
+make USEBUILDDIR=1 BACKEND=cppavx2
 make USEBUILDDIR=1 BACKEND=cpp512y
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-29_01:39:07
+DATE: 2024-08-09_00:50:03
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx
@@ -58,8 +58,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0
  [UNWEIGHT] Wrote 2620 events (found 5403 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7827s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7412s
+ [COUNTERS] PROGRAM TOTAL          :    0.8051s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7635s
  [COUNTERS] Fortran MEs      ( 1 ) :    0.0416s for     8192 events => throughput is 1.97E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3949s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3540s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0410s for     8192 events => throughput is 2.00E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4148s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3740s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0408s for     8192 events => throughput is 2.01E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x10_fortran > /tmp
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577523870256456] fbridge_mode=0
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.6850s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2308s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.4541s for    90112 events => throughput is 1.98E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7188s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2615s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.4573s for    90112 events => throughput is 1.97E+05 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598853620719339] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4401s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3996s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0405s for     8192 events => throughput is 2.02E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4164s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3751s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0410s for     8192 events => throughput is 2.00E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577522280119403] fbridge_mode=1
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.7388s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2934s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4454s for    90112 events => throughput is 2.02E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7041s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2499s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4538s for    90112 events => throughput is 1.99E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.042800e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.004528e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.015984e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.989674e+05                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598849697851406] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3877s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3717s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0160s for     8192 events => throughput is 5.11E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3933s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3758s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0172s for     8192 events => throughput is 4.76E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577518590213366] fbridge_mode=1
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.4390s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2600s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1790s for    90112 events => throughput is 5.03E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4571s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2702s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1866s for    90112 events => throughput is 4.83E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.765299e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.766493e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.747967e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.711541e+05                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598850036412124] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3742s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3654s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0087s for     8192 events => throughput is 9.37E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3932s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3838s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0091s for     8192 events => throughput is 8.99E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577518612400254] fbridge_mode=1
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3503s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2561s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0942s for    90112 events => throughput is 9.56E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.3456s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2495s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0958s for    90112 events => throughput is 9.40E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.065332e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.204759e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.552613e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.210555e+05                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598850036412124] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3831s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3750s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0082s for     8192 events => throughput is 1.00E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3855s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3769s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0083s for     8192 events => throughput is 9.85E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577518612400254] fbridge_mode=1
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3419s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2529s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0890s for    90112 events => throughput is 1.01E+06 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.3394s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2483s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0908s for    90112 events => throughput is 9.92E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.815596e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.706656e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.012011e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.233766e+05                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598854350242270] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3801s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3696s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0106s for     8192 events => throughput is 7.74E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3868s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3748s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0116s for     8192 events => throughput is 7.03E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577522751628507] fbridge_mode=1
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.3921s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2698s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1223s for    90112 events => throughput is 7.37E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.3825s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2565s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1256s for    90112 events => throughput is 7.17E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0003s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.916519e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.942843e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.893526e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.910825e+05                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598870301426373] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7890s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7885s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0005s for     8192 events => throughput is 1.59E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8091s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.8078s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0006s for     8192 events => throughput is 1.43E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0007s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -546,9 +557,10 @@ Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577527268256027] fbridge_mode=1
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.6767s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6712s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0055s for    90112 events => throughput is 1.65E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7098s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.7033s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0058s for    90112 events => throughput is 1.56E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0007s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.812781e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.705094e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.214634e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.269887e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.018876e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.888199e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.400207e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.391800e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.038902e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.898622e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.501990e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.539526e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.639781e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.473018e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.489136e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.495430e+07                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt
index e096eb78b5..75d0c77429 100644
--- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt
@@ -6,8 +6,8 @@ make USEBUILDDIR=1 BACKEND=cuda
 make USEBUILDDIR=1 BACKEND=cppnone
 make USEBUILDDIR=1 BACKEND=cppsse4
 
-make USEBUILDDIR=1 BACKEND=cppavx2
 
+make USEBUILDDIR=1 BACKEND=cppavx2
 make USEBUILDDIR=1 BACKEND=cpp512y
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
@@ -18,10 +18,10 @@ make USEBUILDDIR=1 BACKEND=cpp512z
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
@@ -32,7 +32,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 
 OMP_NUM_THREADS=
 
-DATE: 2024-06-29_01:39:32
+DATE: 2024-08-09_00:50:28
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx
@@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0
  [UNWEIGHT] Wrote 2620 events (found 5403 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7859s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7445s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0414s for     8192 events => throughput is 1.98E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8208s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.7796s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0412s for     8192 events => throughput is 1.99E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3977s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3560s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.0416s for     8192 events => throughput is 1.97E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4160s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3749s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.0411s for     8192 events => throughput is 1.99E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x10_fortran > /tmp
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577523870256456] fbridge_mode=0
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.6901s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2335s
- [COUNTERS] Fortran MEs      ( 1 ) :    0.4566s for    90112 events => throughput is 1.97E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7104s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2559s
+ [COUNTERS] Fortran MEs      ( 1 ) :    0.4544s for    90112 events => throughput is 1.98E+05 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -133,9 +133,10 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598861353577519] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4458s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.4017s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0440s for     8192 events => throughput is 1.86E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.4204s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3749s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0450s for     8192 events => throughput is 1.82E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -166,9 +167,10 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577525144126803] fbridge_mode=1
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.7736s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2953s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4783s for    90112 events => throughput is 1.88E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.7448s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2577s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.4867s for    90112 events => throughput is 1.85E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -181,12 +183,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.906547e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.873127e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.885100e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.907422e+05                 )  sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -209,9 +211,10 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598861353577519] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4069s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3833s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0236s for     8192 events => throughput is 3.48E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3960s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3712s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0244s for     8192 events => throughput is 3.36E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -242,9 +245,10 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577525144126810] fbridge_mode=1
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.5337s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2711s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2626s for    90112 events => throughput is 3.43E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.5269s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2579s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2686s for    90112 events => throughput is 3.35E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -257,12 +261,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.321004e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.333942e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.457355e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.376975e+05                 )  sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -285,9 +289,10 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598861344883289] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3969s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3823s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0147s for     8192 events => throughput is 5.58E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3926s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3769s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0153s for     8192 events => throughput is 5.37E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -318,9 +323,10 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577525178109212] fbridge_mode=1
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.4292s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2669s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1623s for    90112 events => throughput is 5.55E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4173s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2508s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1662s for    90112 events => throughput is 5.42E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -333,12 +339,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.432194e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.335642e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.311023e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.330908e+05                 )  sec^-1
 
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -361,9 +367,10 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598861344883289] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.3859s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3721s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0138s for     8192 events => throughput is 5.94E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3897s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3750s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0143s for     8192 events => throughput is 5.74E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -394,9 +401,10 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577525178109212] fbridge_mode=1
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.4048s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2571s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1477s for    90112 events => throughput is 6.10E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4068s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2528s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.1536s for    90112 events => throughput is 5.87E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -409,12 +417,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.922477e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.855366e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.992472e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.947430e+05                 )  sec^-1
 
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -437,9 +445,10 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598861344883289] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.4037s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.3836s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0201s for     8192 events => throughput is 4.08E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.3995s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.3772s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0219s for     8192 events => throughput is 3.75E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0005s
 
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -470,9 +479,10 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.58 [44.577525178109212] fbridge_mode=1
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
- [COUNTERS] PROGRAM TOTAL          :    1.4944s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.2693s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2251s for    90112 events => throughput is 4.00E+05 events/s
+ [COUNTERS] PROGRAM TOTAL          :    1.4943s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.2580s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.2358s for    90112 events => throughput is 3.82E+05 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0004s
 
 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -485,12 +495,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.827539e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.733262e+05                 )  sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.759224e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.702855e+05                 )  sec^-1
 
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -513,9 +523,10 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggt
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 44.6 [44.598860056955807] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL          :    0.7908s
- [COUNTERS] Fortran Overhead ( 0 ) :    0.7902s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0006s for     8192 events => throughput is 1.43E+07 events/s
+ [COUNTERS] PROGRAM TOTAL          :    0.8053s
+ [COUNTERS] Fortran Overhead ( 0 ) :    0.8039s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0007s for     8192 events => throughput is 1.21E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -547,8 +558,9 @@ Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_susyggt
  [XSECTION] Cross section = 44.58 [44.577523872560512] fbridge_mode=1
  [UNWEIGHT] Wrote 1743 events (found 1748 events)
  [COUNTERS] PROGRAM TOTAL          :    1.6927s
- [COUNTERS] Fortran Overhead ( 0 ) :    1.6862s
- [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0065s for    90112 events => throughput is 1.38E+07 events/s
+ [COUNTERS] Fortran Overhead ( 0 ) :    1.6853s
+ [COUNTERS] CudaCpp MEs      ( 2 ) :    0.0066s for    90112 events => throughput is 1.36E+07 events/s
+ [COUNTERS] CudaCpp HEL      ( 3 ) :    0.0008s
 
 *** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -561,42 +573,42 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.009355e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.871837e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.608692e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.622666e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.895025e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.299743e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.063290e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.055606e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.880064e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.302003e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.136844e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.140289e+08                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.885288e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.319830e+07                 )  sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.014417e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.983678e+07                 )  sec^-1
 
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
 
diff --git a/epochX/cudacpp/tput/gitdifftput.sh b/epochX/cudacpp/tput/gitdifftput.sh
index 541d57ef1e..b2c183afe7 100755
--- a/epochX/cudacpp/tput/gitdifftput.sh
+++ b/epochX/cudacpp/tput/gitdifftput.sh
@@ -27,7 +27,7 @@ exclude2='(Entering|Leaving|Building|HASCURAND|BACKEND|USEBUILDDIR)'
 # Lines (interesting) which may change on different software versions
 exclude3='(Symbols|Avg|Relative|MeanMatrixElemValue)'
 # Lines (uninteresting) which change when missing some tests (no avx512, cuda, hip...)
-exclude4='(runExe|cmpExe|runNcu|SIGMA|Workflow|FP|Internal|OMP|Symbols|PASSED|INFO|WARNING|PROF|\+OK|\-OK|CPU:|===|\.\.\.|\-$|\+$)'
+exclude4='(runTest|runExe|cmpExe|runNcu|SIGMA|Workflow|FP|Internal|OMP|Symbols|PASSED|INFO|WARNING|PROF|\+OK|\-OK|CPU:|===|\.\.\.|\-$|\+$)'
 # Lines (interesting) which show that some tests are missing (no avx512, cuda, hip...)
 exclude5='(Not found|no avx512vl)'
 
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt
index acaa25cc7a..ad26491862 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2024-06-28_20:14:43
+DATE: 2024-08-08_19:47:50
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 6.992468e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.765141e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.187072e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.598959e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.638501e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.177835e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     0.671047 sec
+TOTAL       :     0.698559 sec
 INFO: No Floating Point Exceptions have been reported
-     2,683,013,854      cycles                           #    2.970 GHz                    
-     4,099,943,544      instructions                     #    1.53  insn per cycle         
-       0.969012880 seconds time elapsed
+     2,601,897,002      cycles                           #    2.808 GHz                    
+     4,040,507,104      instructions                     #    1.55  insn per cycle         
+       0.999350103 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.073867e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.251694e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.251694e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.054108e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.229313e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.229313e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     6.258093 sec
+TOTAL       :     6.402837 sec
 INFO: No Floating Point Exceptions have been reported
-    19,045,527,053      cycles                           #    3.041 GHz                    
-    46,105,742,127      instructions                     #    2.42  insn per cycle         
-       6.263804311 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  476) (avx2:    0) (512y:    0) (512z:    0)
+    19,233,855,272      cycles                           #    3.000 GHz                    
+    46,180,507,769      instructions                     #    2.40  insn per cycle         
+       6.412153445 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  463) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.629061e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.125560e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.125560e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.601848e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.093713e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.093713e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     4.266484 sec
+TOTAL       :     4.363298 sec
 INFO: No Floating Point Exceptions have been reported
-    12,926,063,866      cycles                           #    3.026 GHz                    
-    31,614,643,699      instructions                     #    2.45  insn per cycle         
-       4.272247607 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1676) (avx2:    0) (512y:    0) (512z:    0)
+    13,100,720,322      cycles                           #    2.997 GHz                    
+    31,716,075,564      instructions                     #    2.42  insn per cycle         
+       4.372588931 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1664) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.041568e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.865995e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.865995e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.042973e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.858628e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.858628e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.490041 sec
+TOTAL       :     3.509207 sec
 INFO: No Floating Point Exceptions have been reported
-    10,064,100,406      cycles                           #    2.880 GHz                    
-    19,613,549,872      instructions                     #    1.95  insn per cycle         
-       3.496071587 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1972) (512y:    0) (512z:    0)
+    10,205,028,097      cycles                           #    2.901 GHz                    
+    19,707,283,623      instructions                     #    1.93  insn per cycle         
+       3.518316321 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1946) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.101707e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.962511e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.962511e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.068954e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.924439e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.924439e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.393729 sec
+TOTAL       :     3.473859 sec
 INFO: No Floating Point Exceptions have been reported
-     9,843,950,565      cycles                           #    2.897 GHz                    
-    19,263,531,824      instructions                     #    1.96  insn per cycle         
-       3.399372842 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1713) (512y:  178) (512z:    0)
+    10,004,130,884      cycles                           #    2.873 GHz                    
+    19,357,111,804      instructions                     #    1.93  insn per cycle         
+       3.483068816 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1685) (512y:  178) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.833491e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.443330e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.443330e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.804457e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.421604e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.421604e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.832834 sec
+TOTAL       :     3.921670 sec
 INFO: No Floating Point Exceptions have been reported
-     8,597,411,507      cycles                           #    2.241 GHz                    
-    15,728,652,694      instructions                     #    1.83  insn per cycle         
-       3.838260969 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:  900) (512y:  156) (512z: 1257)
+     8,766,336,363      cycles                           #    2.231 GHz                    
+    15,830,799,810      instructions                     #    1.81  insn per cycle         
+       3.930866073 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:  880) (512y:  156) (512z: 1257)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt
index 076a1808b0..254ccc5cd6 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2024-06-28_20:49:18
+DATE: 2024-08-08_20:16:34
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.215489e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.207119e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.207119e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.859786e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.167324e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.167324e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     2.432396 sec
+TOTAL       :     2.182775 sec
 INFO: No Floating Point Exceptions have been reported
-     7,595,029,577      cycles                           #    2.848 GHz                    
-    13,416,645,348      instructions                     #    1.77  insn per cycle         
-       2.736931171 seconds time elapsed
+     7,222,143,773      cycles                           #    2.974 GHz                    
+    12,988,458,578      instructions                     #    1.80  insn per cycle         
+       2.484589357 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -70,8 +70,10 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -89,20 +91,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.034815e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.199828e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.199828e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.023014e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.186587e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.186587e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     6.685334 sec
+TOTAL       :     6.792659 sec
 INFO: No Floating Point Exceptions have been reported
-    20,294,656,403      cycles                           #    3.033 GHz                    
-    46,336,697,418      instructions                     #    2.28  insn per cycle         
-       6.692388795 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  476) (avx2:    0) (512y:    0) (512z:    0)
+    20,463,079,955      cycles                           #    3.008 GHz                    
+    46,412,955,093      instructions                     #    2.27  insn per cycle         
+       6.804041518 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  463) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -118,20 +121,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.544003e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.988575e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.988575e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.536442e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.970461e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.970461e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     4.684901 sec
+TOTAL       :     4.741441 sec
 INFO: No Floating Point Exceptions have been reported
-    14,246,354,655      cycles                           #    3.037 GHz                    
-    32,456,861,259      instructions                     #    2.28  insn per cycle         
-       4.692010345 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1676) (avx2:    0) (512y:    0) (512z:    0)
+    14,332,452,862      cycles                           #    3.016 GHz                    
+    32,573,923,419      instructions                     #    2.27  insn per cycle         
+       4.753137415 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1664) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -147,20 +151,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.922827e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.637793e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.637793e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.834595e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.507335e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.507335e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.896654 sec
+TOTAL       :     4.104610 sec
 INFO: No Floating Point Exceptions have been reported
-    11,461,775,110      cycles                           #    2.937 GHz                    
-    20,973,003,881      instructions                     #    1.83  insn per cycle         
-       3.903908173 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1972) (512y:    0) (512z:    0)
+    11,547,104,567      cycles                           #    2.806 GHz                    
+    21,093,610,719      instructions                     #    1.83  insn per cycle         
+       4.116807687 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1946) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -176,20 +181,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.970040e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.709759e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.709759e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.917747e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.629096e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.629096e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.821641 sec
+TOTAL       :     3.937807 sec
 INFO: No Floating Point Exceptions have been reported
-    11,209,928,511      cycles                           #    2.929 GHz                    
-    20,622,007,360      instructions                     #    1.84  insn per cycle         
-       3.828782847 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1713) (512y:  178) (512z:    0)
+    11,279,300,088      cycles                           #    2.856 GHz                    
+    20,732,054,777      instructions                     #    1.84  insn per cycle         
+       3.949582750 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1685) (512y:  178) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -205,20 +211,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.674013e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.181228e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.181228e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.634373e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.159831e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.159831e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     4.396074 sec
+TOTAL       :     4.550735 sec
 INFO: No Floating Point Exceptions have been reported
-    10,006,474,405      cycles                           #    2.275 GHz                    
-    16,876,699,682      instructions                     #    1.69  insn per cycle         
-       4.403473618 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:  900) (512y:  156) (512z: 1257)
+    10,336,377,696      cycles                           #    2.266 GHz                    
+    17,023,763,380      instructions                     #    1.65  insn per cycle         
+       4.562764893 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:  880) (512y:  156) (512z: 1257)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt
index 107d3d6a6a..a17dc8d37a 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2024-06-28_21:00:12
+DATE: 2024-08-08_20:28:00
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 6.812093e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.683603e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.123683e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.117423e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.844085e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.131938e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371879e-02 +- 3.270020e-06 )  GeV^0
-TOTAL       :     1.372435 sec
+TOTAL       :     1.358559 sec
 INFO: No Floating Point Exceptions have been reported
-     4,633,412,607      cycles                           #    2.869 GHz                    
-     7,070,218,708      instructions                     #    1.53  insn per cycle         
-       1.671081959 seconds time elapsed
+     4,616,681,568      cycles                           #    2.947 GHz                    
+     7,101,035,160      instructions                     #    1.54  insn per cycle         
+       1.643879361 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.056221e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.231517e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.231517e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.047167e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.219441e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.219441e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371879e-02 +- 3.270020e-06 )  GeV^0
-TOTAL       :     6.732250 sec
+TOTAL       :     6.877625 sec
 INFO: No Floating Point Exceptions have been reported
-    20,212,821,842      cycles                           #    3.004 GHz                    
-    46,213,328,831      instructions                     #    2.29  insn per cycle         
-       6.737705322 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  476) (avx2:    0) (512y:    0) (512z:    0)
+    20,474,853,896      cycles                           #    2.975 GHz                    
+    46,476,031,399      instructions                     #    2.27  insn per cycle         
+       6.883195189 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  463) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.624027e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.119257e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.119257e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.613543e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.104302e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.104302e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371879e-02 +- 3.270020e-06 )  GeV^0
-TOTAL       :     4.643508 sec
+TOTAL       :     4.762997 sec
 INFO: No Floating Point Exceptions have been reported
-    14,061,889,539      cycles                           #    3.025 GHz                    
-    31,617,431,963      instructions                     #    2.25  insn per cycle         
-       4.649295615 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1676) (avx2:    0) (512y:    0) (512z:    0)
+    14,341,567,999      cycles                           #    3.008 GHz                    
+    31,906,796,447      instructions                     #    2.22  insn per cycle         
+       4.768768263 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1664) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.052114e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.869806e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.869806e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.037523e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.848398e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.848398e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371879e-02 +- 3.270020e-06 )  GeV^0
-TOTAL       :     3.828551 sec
+TOTAL       :     3.928675 sec
 INFO: No Floating Point Exceptions have been reported
-    11,213,173,484      cycles                           #    2.925 GHz                    
-    19,515,414,655      instructions                     #    1.74  insn per cycle         
-       3.833932438 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1972) (512y:    0) (512z:    0)
+    11,431,967,131      cycles                           #    2.907 GHz                    
+    19,749,163,356      instructions                     #    1.73  insn per cycle         
+       3.934544865 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1946) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.111125e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.978452e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.978452e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.057561e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.903205e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.903205e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371879e-02 +- 3.270020e-06 )  GeV^0
-TOTAL       :     3.744899 sec
+TOTAL       :     3.914555 sec
 INFO: No Floating Point Exceptions have been reported
-    10,990,201,092      cycles                           #    2.931 GHz                    
-    18,964,501,195      instructions                     #    1.73  insn per cycle         
-       3.750289687 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1713) (512y:  178) (512z:    0)
+    11,301,789,336      cycles                           #    2.884 GHz                    
+    19,198,978,685      instructions                     #    1.70  insn per cycle         
+       3.919932247 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1685) (512y:  178) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.836834e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.450918e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.450918e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.792077e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.384424e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.384424e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371879e-02 +- 3.270020e-06 )  GeV^0
-TOTAL       :     4.193324 sec
+TOTAL       :     4.355139 sec
 INFO: No Floating Point Exceptions have been reported
-     9,768,736,605      cycles                           #    2.327 GHz                    
-    15,431,475,618      instructions                     #    1.58  insn per cycle         
-       4.198624459 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:  900) (512y:  156) (512z: 1257)
+     9,975,675,333      cycles                           #    2.288 GHz                    
+    15,643,574,075      instructions                     #    1.57  insn per cycle         
+       4.360684158 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:  880) (512y:  156) (512z: 1257)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt
index 6fb775969f..02f69b4d1c 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2024-06-28_20:57:32
+DATE: 2024-08-08_20:25:18
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 6.817933e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.722429e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.176386e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.161167e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.790408e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.166295e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     0.968188 sec
+TOTAL       :     0.968841 sec
 INFO: No Floating Point Exceptions have been reported
-     3,589,395,127      cycles                           #    2.997 GHz                    
-     7,175,672,002      instructions                     #    2.00  insn per cycle         
-       1.254215961 seconds time elapsed
+     3,539,663,050      cycles                           #    2.958 GHz                    
+     6,992,486,553      instructions                     #    1.98  insn per cycle         
+       1.255291189 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.079632e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.258365e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.258365e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.054864e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.230420e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.230420e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     6.223806 sec
+TOTAL       :     6.368076 sec
 INFO: No Floating Point Exceptions have been reported
-    19,058,925,167      cycles                           #    3.060 GHz                    
-    46,108,548,764      instructions                     #    2.42  insn per cycle         
-       6.229330377 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  476) (avx2:    0) (512y:    0) (512z:    0)
+    19,096,334,706      cycles                           #    2.997 GHz                    
+    46,076,716,123      instructions                     #    2.41  insn per cycle         
+       6.373662191 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  463) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.601349e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.088576e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.088576e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.601324e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.083048e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.083048e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     4.339936 sec
+TOTAL       :     4.335443 sec
 INFO: No Floating Point Exceptions have been reported
-    12,970,405,515      cycles                           #    2.986 GHz                    
-    31,616,392,768      instructions                     #    2.44  insn per cycle         
-       4.345471916 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1676) (avx2:    0) (512y:    0) (512z:    0)
+    12,960,942,150      cycles                           #    2.986 GHz                    
+    31,610,247,350      instructions                     #    2.44  insn per cycle         
+       4.340962885 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1664) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.072548e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.896235e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.896235e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.037265e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.842019e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.842019e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.433513 sec
+TOTAL       :     3.487108 sec
 INFO: No Floating Point Exceptions have been reported
-    10,074,347,286      cycles                           #    2.931 GHz                    
-    19,614,838,060      instructions                     #    1.95  insn per cycle         
-       3.438916068 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1972) (512y:    0) (512z:    0)
+    10,064,000,379      cycles                           #    2.882 GHz                    
+    19,599,635,012      instructions                     #    1.95  insn per cycle         
+       3.492608891 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1946) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.120381e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.995544e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.995544e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.083703e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.929723e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.929723e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.362383 sec
+TOTAL       :     3.417011 sec
 INFO: No Floating Point Exceptions have been reported
-     9,846,721,407      cycles                           #    2.924 GHz                    
-    19,261,667,804      instructions                     #    1.96  insn per cycle         
-       3.367866307 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1713) (512y:  178) (512z:    0)
+     9,860,886,386      cycles                           #    2.882 GHz                    
+    19,261,098,945      instructions                     #    1.95  insn per cycle         
+       3.422241820 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1685) (512y:  178) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.838612e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.456006e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.456006e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.806629e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.401308e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.401308e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.825489 sec
+TOTAL       :     3.881256 sec
 INFO: No Floating Point Exceptions have been reported
-     8,628,909,528      cycles                           #    2.253 GHz                    
-    15,727,742,566      instructions                     #    1.82  insn per cycle         
-       3.831075373 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:  900) (512y:  156) (512z: 1257)
+     8,602,524,027      cycles                           #    2.214 GHz                    
+    15,722,205,670      instructions                     #    1.83  insn per cycle         
+       3.886723200 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:  880) (512y:  156) (512z: 1257)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt
index bdcfd1fbcb..35f9b1d01f 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2024-06-28_20:54:51
+DATE: 2024-08-08_20:22:33
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,22 +50,24 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 6.161971e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.654112e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.070165e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.201911e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.800503e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.039847e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     1.855886 sec
+TOTAL       :     1.856881 sec
 INFO: No Floating Point Exceptions have been reported
-     6,276,149,380      cycles                           #    3.000 GHz                    
-    11,476,884,450      instructions                     #    1.83  insn per cycle         
-       2.148848830 seconds time elapsed
+     6,224,640,386      cycles                           #    2.971 GHz                    
+    11,427,865,713      instructions                     #    1.84  insn per cycle         
+       2.153600888 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -82,20 +84,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.066433e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.245773e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.245773e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.044821e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.217145e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.217145e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     6.298786 sec
+TOTAL       :     6.426882 sec
 INFO: No Floating Point Exceptions have been reported
-    19,098,852,731      cycles                           #    3.030 GHz                    
-    46,107,987,765      instructions                     #    2.41  insn per cycle         
-       6.304281104 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  476) (avx2:    0) (512y:    0) (512z:    0)
+    19,111,682,358      cycles                           #    2.975 GHz                    
+    46,077,003,649      instructions                     #    2.41  insn per cycle         
+       6.432401292 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  463) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -110,20 +113,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.595423e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.081086e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.081086e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.618749e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.109823e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.109823e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     4.355941 sec
+TOTAL       :     4.289727 sec
 INFO: No Floating Point Exceptions have been reported
-    12,963,236,746      cycles                           #    2.973 GHz                    
-    31,615,561,135      instructions                     #    2.44  insn per cycle         
-       4.361261638 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1676) (avx2:    0) (512y:    0) (512z:    0)
+    12,954,885,068      cycles                           #    3.017 GHz                    
+    31,610,318,935      instructions                     #    2.44  insn per cycle         
+       4.295110036 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1664) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -138,20 +142,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.008621e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.798575e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.798575e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.027068e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.831891e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.831891e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.540183 sec
+TOTAL       :     3.501218 sec
 INFO: No Floating Point Exceptions have been reported
-    10,084,623,628      cycles                           #    2.845 GHz                    
-    19,614,025,069      instructions                     #    1.94  insn per cycle         
-       3.545761412 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1972) (512y:    0) (512z:    0)
+    10,084,953,651      cycles                           #    2.877 GHz                    
+    19,599,538,271      instructions                     #    1.94  insn per cycle         
+       3.506570863 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1946) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -166,20 +171,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.116739e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.991229e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.991229e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.095436e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.953376e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.953376e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.368422 sec
+TOTAL       :     3.399978 sec
 INFO: No Floating Point Exceptions have been reported
-     9,858,689,282      cycles                           #    2.923 GHz                    
-    19,261,795,866      instructions                     #    1.95  insn per cycle         
-       3.373773154 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1713) (512y:  178) (512z:    0)
+     9,825,140,072      cycles                           #    2.886 GHz                    
+    19,248,188,821      instructions                     #    1.96  insn per cycle         
+       3.405318176 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1685) (512y:  178) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -194,20 +200,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.845230e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.462099e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.462099e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.764156e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.337626e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.337626e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.809187 sec
+TOTAL       :     3.973951 sec
 INFO: No Floating Point Exceptions have been reported
-     8,619,862,123      cycles                           #    2.260 GHz                    
-    15,729,443,486      instructions                     #    1.82  insn per cycle         
-       3.814723478 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:  900) (512y:  156) (512z: 1257)
+     8,632,225,098      cycles                           #    2.170 GHz                    
+    15,724,542,893      instructions                     #    1.82  insn per cycle         
+       3.979226146 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:  880) (512y:  156) (512z: 1257)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt
index a7f2977f4a..30013486b3 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2024-06-28_20:15:13
+DATE: 2024-08-08_19:48:21
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 7.219099e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.898314e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.236850e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.631857e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.952875e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.229430e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     0.660344 sec
+TOTAL       :     0.661237 sec
 INFO: No Floating Point Exceptions have been reported
-     2,665,657,651      cycles                           #    2.984 GHz                    
-     4,166,061,268      instructions                     #    1.56  insn per cycle         
-       0.952670294 seconds time elapsed
+     2,635,614,506      cycles                           #    2.952 GHz                    
+     4,105,447,914      instructions                     #    1.56  insn per cycle         
+       0.952322039 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.068718e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.244728e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.244728e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.051765e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.227570e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.227570e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     6.286281 sec
+TOTAL       :     6.414969 sec
 INFO: No Floating Point Exceptions have been reported
-    19,185,891,095      cycles                           #    3.050 GHz                    
-    46,066,815,570      instructions                     #    2.40  insn per cycle         
-       6.291973339 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  465) (avx2:    0) (512y:    0) (512z:    0)
+    19,212,287,097      cycles                           #    2.991 GHz                    
+    46,135,858,785      instructions                     #    2.40  insn per cycle         
+       6.423899634 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  452) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.628951e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.130065e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.130065e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.601077e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.094081e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.094081e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     4.270966 sec
+TOTAL       :     4.367872 sec
 INFO: No Floating Point Exceptions have been reported
-    12,944,360,063      cycles                           #    3.028 GHz                    
-    31,588,001,932      instructions                     #    2.44  insn per cycle         
-       4.276321086 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2:    0) (512y:    0) (512z:    0)
+    13,124,994,280      cycles                           #    3.000 GHz                    
+    31,690,002,602      instructions                     #    2.41  insn per cycle         
+       4.377128729 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1650) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.045031e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.858625e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.858625e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.022628e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.826530e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.826530e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.478567 sec
+TOTAL       :     3.545071 sec
 INFO: No Floating Point Exceptions have been reported
-    10,055,767,156      cycles                           #    2.887 GHz                    
-    19,593,455,838      instructions                     #    1.95  insn per cycle         
-       3.484108783 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1955) (512y:    0) (512z:    0)
+    10,210,134,759      cycles                           #    2.873 GHz                    
+    19,686,352,650      instructions                     #    1.93  insn per cycle         
+       3.554081422 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1929) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.065932e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.899247e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.899247e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.045349e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.884198e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.884198e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.447809 sec
+TOTAL       :     3.513122 sec
 INFO: No Floating Point Exceptions have been reported
-     9,836,323,224      cycles                           #    2.849 GHz                    
-    19,277,262,833      instructions                     #    1.96  insn per cycle         
-       3.453628682 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1698) (512y:  178) (512z:    0)
+    10,000,248,812      cycles                           #    2.840 GHz                    
+    19,370,551,089      instructions                     #    1.94  insn per cycle         
+       3.521931882 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1670) (512y:  178) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.866203e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.502282e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.502282e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.856445e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.503167e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.503167e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.770476 sec
+TOTAL       :     3.821454 sec
 INFO: No Floating Point Exceptions have been reported
-     8,464,276,197      cycles                           #    2.242 GHz                    
-    15,597,917,914      instructions                     #    1.84  insn per cycle         
-       3.775924795 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:  886) (512y:  156) (512z: 1237)
+     8,619,394,582      cycles                           #    2.251 GHz                    
+    15,699,269,615      instructions                     #    1.82  insn per cycle         
+       3.830496732 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:  866) (512y:  156) (512z: 1237)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
index 8eeaf4b96f..012009e54a 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2024-06-28_20:40:08
+DATE: 2024-08-08_20:07:19
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 6.511337e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.610262e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.162427e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.604046e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.930880e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.176471e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     0.682277 sec
+TOTAL       :     0.659931 sec
 INFO: No Floating Point Exceptions have been reported
-     2,714,625,205      cycles                           #    2.961 GHz                    
-     4,224,591,836      instructions                     #    1.56  insn per cycle         
-       0.976598730 seconds time elapsed
+     2,627,383,079      cycles                           #    2.945 GHz                    
+     4,093,880,816      instructions                     #    1.56  insn per cycle         
+       0.951439392 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.674677e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.150026e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.150026e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.646087e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.119341e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.119341e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     4.159141 sec
+TOTAL       :     4.251791 sec
 INFO: No Floating Point Exceptions have been reported
-    12,686,441,916      cycles                           #    3.047 GHz                    
-    32,456,476,690      instructions                     #    2.56  insn per cycle         
-       4.164732578 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  294) (avx2:    0) (512y:    0) (512z:    0)
+    12,834,346,286      cycles                           #    3.012 GHz                    
+    32,589,275,830      instructions                     #    2.54  insn per cycle         
+       4.261338656 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  281) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.114486e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.024182e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.024182e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.060473e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.955935e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.955935e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.379506 sec
+TOTAL       :     3.488809 sec
 INFO: No Floating Point Exceptions have been reported
-    10,290,063,880      cycles                           #    3.041 GHz                    
-    24,598,812,176      instructions                     #    2.39  insn per cycle         
-       3.384861278 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1263) (avx2:    0) (512y:    0) (512z:    0)
+    10,533,405,751      cycles                           #    3.012 GHz                    
+    24,716,100,998      instructions                     #    2.35  insn per cycle         
+       3.498417147 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1251) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.309413e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.377046e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.377046e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.261794e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.343751e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.343751e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.121396 sec
+TOTAL       :     3.211208 sec
 INFO: No Floating Point Exceptions have been reported
-     9,147,892,087      cycles                           #    2.927 GHz                    
-    16,921,916,652      instructions                     #    1.85  insn per cycle         
-       3.126771565 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1630) (512y:    0) (512z:    0)
+     9,296,707,178      cycles                           #    2.887 GHz                    
+    17,025,233,631      instructions                     #    1.83  insn per cycle         
+       3.220709148 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1608) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.371372e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.510492e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.510492e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.333155e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.462746e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.462746e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.049635 sec
+TOTAL       :     3.127002 sec
 INFO: No Floating Point Exceptions have been reported
-     8,903,759,047      cycles                           #    2.915 GHz                    
-    16,334,428,688      instructions                     #    1.83  insn per cycle         
-       3.055129338 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1370) (512y:  139) (512z:    0)
+     9,070,042,536      cycles                           #    2.893 GHz                    
+    16,440,168,447      instructions                     #    1.81  insn per cycle         
+       3.136632933 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1344) (512y:  139) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.046677e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.842152e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.842152e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.025516e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.816401e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.816401e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.475555 sec
+TOTAL       :     3.537864 sec
 INFO: No Floating Point Exceptions have been reported
-     7,902,906,453      cycles                           #    2.271 GHz                    
-    14,570,187,590      instructions                     #    1.84  insn per cycle         
-       3.480874373 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1012) (512y:  158) (512z:  954)
+     8,060,468,675      cycles                           #    2.273 GHz                    
+    14,674,271,295      instructions                     #    1.82  insn per cycle         
+       3.547452410 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:  990) (512y:  158) (512z:  954)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
index bed7cd9b36..6698342434 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2024-06-28_20:40:34
+DATE: 2024-08-08_20:07:46
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 6.514355e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.633683e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.214359e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.562157e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.979811e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.228825e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     0.680593 sec
+TOTAL       :     0.660029 sec
 INFO: No Floating Point Exceptions have been reported
-     2,706,915,647      cycles                           #    2.956 GHz                    
-     4,190,146,936      instructions                     #    1.55  insn per cycle         
-       0.976111914 seconds time elapsed
+     2,629,191,587      cycles                           #    2.942 GHz                    
+     4,053,968,750      instructions                     #    1.54  insn per cycle         
+       0.953306046 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.165926e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.047190e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.047190e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.156529e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.042455e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.042455e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.302787 sec
+TOTAL       :     3.343977 sec
 INFO: No Floating Point Exceptions have been reported
-    10,019,502,432      cycles                           #    3.029 GHz                    
-    25,412,419,372      instructions                     #    2.54  insn per cycle         
-       3.308288779 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  249) (avx2:    0) (512y:    0) (512z:    0)
+    10,082,768,824      cycles                           #    3.008 GHz                    
+    25,523,612,333      instructions                     #    2.53  insn per cycle         
+       3.352820230 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  236) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.445937e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.787045e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.787045e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.385757e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.677774e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.677774e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     2.976079 sec
+TOTAL       :     3.073965 sec
 INFO: No Floating Point Exceptions have been reported
-     9,030,679,969      cycles                           #    3.030 GHz                    
-    21,406,976,056      instructions                     #    2.37  insn per cycle         
-       2.981503418 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1112) (avx2:    0) (512y:    0) (512z:    0)
+     9,151,066,373      cycles                           #    2.969 GHz                    
+    21,519,389,474      instructions                     #    2.35  insn per cycle         
+       3.083295145 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1100) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.454857e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.711643e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.711643e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.361878e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.558423e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.558423e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     2.961656 sec
+TOTAL       :     3.100634 sec
 INFO: No Floating Point Exceptions have been reported
-     8,673,614,407      cycles                           #    2.924 GHz                    
-    15,870,802,072      instructions                     #    1.83  insn per cycle         
-       2.967121740 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1503) (512y:    0) (512z:    0)
+     8,837,735,013      cycles                           #    2.843 GHz                    
+    15,972,170,074      instructions                     #    1.81  insn per cycle         
+       3.110024553 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1481) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.517492e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.840768e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.840768e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.456785e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.751546e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.751546e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     2.896325 sec
+TOTAL       :     2.990911 sec
 INFO: No Floating Point Exceptions have been reported
-     8,488,467,643      cycles                           #    2.926 GHz                    
-    15,590,785,507      instructions                     #    1.84  insn per cycle         
-       2.901737452 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1282) (512y:  141) (512z:    0)
+     8,652,752,906      cycles                           #    2.885 GHz                    
+    15,679,245,875      instructions                     #    1.81  insn per cycle         
+       3.000632003 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1256) (512y:  141) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.182346e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.108553e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.108553e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.146098e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.052577e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.052577e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.281506 sec
+TOTAL       :     3.361559 sec
 INFO: No Floating Point Exceptions have been reported
-     7,603,569,525      cycles                           #    2.314 GHz                    
-    14,280,278,230      instructions                     #    1.88  insn per cycle         
-       3.286945297 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1041) (512y:  164) (512z:  876)
+     7,684,713,240      cycles                           #    2.281 GHz                    
+    14,381,480,169      instructions                     #    1.87  insn per cycle         
+       3.370756572 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1019) (512y:  164) (512z:  876)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
index 7cc0b5502b..7cb0226a73 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2024-06-28_20:15:43
+DATE: 2024-08-08_19:48:51
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 1.368793e+08                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.204395e+09                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.149884e+09                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.527020e+08                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.262134e+09                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.154425e+09                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371687e-02 +- 3.270220e-06 )  GeV^0
-TOTAL       :     0.572378 sec
+TOTAL       :     0.568660 sec
 INFO: No Floating Point Exceptions have been reported
-     2,355,843,554      cycles                           #    2.958 GHz                    
-     3,660,983,048      instructions                     #    1.55  insn per cycle         
-       0.854414572 seconds time elapsed
+     2,313,614,099      cycles                           #    2.926 GHz                    
+     3,562,444,599      instructions                     #    1.54  insn per cycle         
+       0.849201094 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 121
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.112502e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.313461e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.313461e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.093483e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.290231e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.290231e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371707e-02 +- 3.270376e-06 )  GeV^0
-TOTAL       :     6.013760 sec
+TOTAL       :     6.128520 sec
 INFO: No Floating Point Exceptions have been reported
-    18,243,635,947      cycles                           #    3.031 GHz                    
-    45,004,183,150      instructions                     #    2.47  insn per cycle         
-       6.018991847 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  424) (avx2:    0) (512y:    0) (512z:    0)
+    18,358,884,229      cycles                           #    2.993 GHz                    
+    45,043,610,227      instructions                     #    2.45  insn per cycle         
+       6.135113438 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  411) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.304371e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.534496e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.534496e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.301890e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.520762e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.520762e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371707e-02 +- 3.270376e-06 )  GeV^0
-TOTAL       :     3.091758 sec
+TOTAL       :     3.110805 sec
 INFO: No Floating Point Exceptions have been reported
-     9,346,891,298      cycles                           #    3.019 GHz                    
-    22,293,361,334      instructions                     #    2.39  insn per cycle         
-       3.097066950 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1969) (avx2:    0) (512y:    0) (512z:    0)
+     9,366,787,669      cycles                           #    3.005 GHz                    
+    22,330,309,821      instructions                     #    2.38  insn per cycle         
+       3.117673303 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1957) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.502796e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.831762e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.831762e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.473210e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.807312e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.807312e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270341e-06 )  GeV^0
-TOTAL       :     2.873268 sec
+TOTAL       :     2.917892 sec
 INFO: No Floating Point Exceptions have been reported
-     8,375,296,851      cycles                           #    2.910 GHz                    
-    15,755,495,593      instructions                     #    1.88  insn per cycle         
-       2.878533702 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2608) (512y:    0) (512z:    0)
+     8,504,359,827      cycles                           #    2.909 GHz                    
+    15,788,659,527      instructions                     #    1.86  insn per cycle         
+       2.924742872 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2595) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.546067e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.922691e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.922691e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.503770e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.901448e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.901448e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270341e-06 )  GeV^0
-TOTAL       :     2.827933 sec
+TOTAL       :     2.886577 sec
 INFO: No Floating Point Exceptions have been reported
-     8,252,760,214      cycles                           #    2.914 GHz                    
-    15,614,570,874      instructions                     #    1.89  insn per cycle         
-       2.833212419 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2516) (512y:   12) (512z:    0)
+     8,412,391,431      cycles                           #    2.908 GHz                    
+    15,643,654,257      instructions                     #    1.86  insn per cycle         
+       2.893387724 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2500) (512y:   12) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.539602e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.940166e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.940166e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.563180e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.953888e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.953888e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270342e-06 )  GeV^0
-TOTAL       :     2.842362 sec
+TOTAL       :     2.828437 sec
 INFO: No Floating Point Exceptions have been reported
-     6,678,672,318      cycles                           #    2.346 GHz                    
-    12,862,803,929      instructions                     #    1.93  insn per cycle         
-       2.847578540 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1736) (512y:   17) (512z: 1439)
+     6,692,094,866      cycles                           #    2.362 GHz                    
+    12,901,049,888      instructions                     #    1.93  insn per cycle         
+       2.834887138 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1728) (512y:   17) (512z: 1439)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
index 7685e81166..e0350b6b37 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2024-06-28_20:49:52
+DATE: 2024-08-08_20:17:08
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 7.311756e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.058142e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.058142e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.473571e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.655207e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.655207e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371710e-02 +- 3.270389e-06 )  GeV^0
-TOTAL       :     1.675029 sec
+TOTAL       :     1.648294 sec
 INFO: No Floating Point Exceptions have been reported
-     5,729,315,825      cycles                           #    2.994 GHz                    
-    10,322,159,846      instructions                     #    1.80  insn per cycle         
-       1.971186526 seconds time elapsed
+     5,601,516,010      cycles                           #    2.985 GHz                    
+    10,167,612,404      instructions                     #    1.82  insn per cycle         
+       1.933877739 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -70,8 +70,10 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 121
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -89,20 +91,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.083960e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.275861e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.275861e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.085388e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.276616e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.276616e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371707e-02 +- 3.270376e-06 )  GeV^0
-TOTAL       :     6.271203 sec
+TOTAL       :     6.267894 sec
 INFO: No Floating Point Exceptions have been reported
-    18,953,360,500      cycles                           #    3.020 GHz                    
-    45,156,793,477      instructions                     #    2.38  insn per cycle         
-       6.277920644 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  424) (avx2:    0) (512y:    0) (512z:    0)
+    18,908,429,443      cycles                           #    3.015 GHz                    
+    45,146,579,440      instructions                     #    2.39  insn per cycle         
+       6.274110345 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  411) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -118,20 +121,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.201354e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.285456e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.285456e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.203296e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.287244e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.287244e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371707e-02 +- 3.270376e-06 )  GeV^0
-TOTAL       :     3.357849 sec
+TOTAL       :     3.346534 sec
 INFO: No Floating Point Exceptions have been reported
-    10,064,004,997      cycles                           #    2.992 GHz                    
-    23,628,789,623      instructions                     #    2.35  insn per cycle         
-       3.364700820 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1969) (avx2:    0) (512y:    0) (512z:    0)
+    10,054,217,163      cycles                           #    3.000 GHz                    
+    23,624,196,038      instructions                     #    2.35  insn per cycle         
+       3.352720761 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1957) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -147,20 +151,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.368809e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.551932e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.551932e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.355349e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.546206e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.546206e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270341e-06 )  GeV^0
-TOTAL       :     3.146808 sec
+TOTAL       :     3.162857 sec
 INFO: No Floating Point Exceptions have been reported
-     9,223,382,458      cycles                           #    2.927 GHz                    
-    16,875,617,789      instructions                     #    1.83  insn per cycle         
-       3.153415409 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2608) (512y:    0) (512z:    0)
+     9,188,398,792      cycles                           #    2.900 GHz                    
+    16,865,170,162      instructions                     #    1.84  insn per cycle         
+       3.169069798 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2595) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -176,20 +181,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.410709e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.657033e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.657033e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.385264e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.627916e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.627916e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270341e-06 )  GeV^0
-TOTAL       :     3.101188 sec
+TOTAL       :     3.125444 sec
 INFO: No Floating Point Exceptions have been reported
-     9,095,421,618      cycles                           #    2.931 GHz                    
-    16,730,461,761      instructions                     #    1.84  insn per cycle         
-       3.108136976 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2516) (512y:   12) (512z:    0)
+     9,070,498,443      cycles                           #    2.897 GHz                    
+    16,723,535,304      instructions                     #    1.84  insn per cycle         
+       3.131626525 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2500) (512y:   12) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -205,20 +211,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.442658e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.667781e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.667781e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.403637e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.591618e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.591618e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270342e-06 )  GeV^0
-TOTAL       :     3.065007 sec
+TOTAL       :     3.114765 sec
 INFO: No Floating Point Exceptions have been reported
-     7,435,897,800      cycles                           #    2.422 GHz                    
-    14,069,390,876      instructions                     #    1.89  insn per cycle         
-       3.071736369 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1736) (512y:   17) (512z: 1439)
+     7,403,928,752      cycles                           #    2.373 GHz                    
+    14,061,923,411      instructions                     #    1.90  insn per cycle         
+       3.121062730 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1728) (512y:   17) (512z: 1439)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
index e48bdabd24..134d5790db 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2024-06-28_21:00:44
+DATE: 2024-08-08_20:28:33
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 1.330774e+08                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.178373e+09                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.122775e+09                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.369933e+08                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.192240e+09                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.130758e+09                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371863e-02 +- 3.269951e-06 )  GeV^0
-TOTAL       :     1.177048 sec
+TOTAL       :     1.177651 sec
 INFO: No Floating Point Exceptions have been reported
-     4,161,772,250      cycles                           #    2.977 GHz                    
-     6,642,618,775      instructions                     #    1.60  insn per cycle         
-       1.455092421 seconds time elapsed
+     4,159,647,361      cycles                           #    2.974 GHz                    
+     6,655,919,197      instructions                     #    1.60  insn per cycle         
+       1.454885517 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 121
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.114730e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.316689e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.316689e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.106596e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.306356e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.306356e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371887e-02 +- 3.270267e-06 )  GeV^0
-TOTAL       :     6.329097 sec
+TOTAL       :     6.378232 sec
 INFO: No Floating Point Exceptions have been reported
-    19,286,025,200      cycles                           #    3.045 GHz                    
-    45,186,488,992      instructions                     #    2.34  insn per cycle         
-       6.334387640 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  424) (avx2:    0) (512y:    0) (512z:    0)
+    19,274,317,116      cycles                           #    3.020 GHz                    
+    45,182,791,116      instructions                     #    2.34  insn per cycle         
+       6.383426426 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  411) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.315547e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.562028e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.562028e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.314732e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.536945e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.536945e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371887e-02 +- 3.270266e-06 )  GeV^0
-TOTAL       :     3.414865 sec
+TOTAL       :     3.415254 sec
 INFO: No Floating Point Exceptions have been reported
-    10,356,902,074      cycles                           #    3.029 GHz                    
-    22,374,508,098      instructions                     #    2.16  insn per cycle         
-       3.420105986 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1969) (avx2:    0) (512y:    0) (512z:    0)
+    10,316,548,749      cycles                           #    3.017 GHz                    
+    22,369,828,182      instructions                     #    2.17  insn per cycle         
+       3.420542694 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1957) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.458189e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.760875e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.760875e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.440596e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.750420e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.750420e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371885e-02 +- 3.270112e-06 )  GeV^0
-TOTAL       :     3.251788 sec
+TOTAL       :     3.274423 sec
 INFO: No Floating Point Exceptions have been reported
-     9,410,320,620      cycles                           #    2.890 GHz                    
-    15,667,209,118      instructions                     #    1.66  insn per cycle         
-       3.257075709 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2608) (512y:    0) (512z:    0)
+     9,443,732,115      cycles                           #    2.881 GHz                    
+    15,660,089,896      instructions                     #    1.66  insn per cycle         
+       3.279649935 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2595) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.506691e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.885597e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.885597e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.490204e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.861466e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.861466e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371885e-02 +- 3.270112e-06 )  GeV^0
-TOTAL       :     3.208545 sec
+TOTAL       :     3.226764 sec
 INFO: No Floating Point Exceptions have been reported
-     9,330,497,019      cycles                           #    2.904 GHz                    
-    15,326,133,128      instructions                     #    1.64  insn per cycle         
-       3.213770933 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2516) (512y:   12) (512z:    0)
+     9,373,690,310      cycles                           #    2.901 GHz                    
+    15,311,292,063      instructions                     #    1.63  insn per cycle         
+       3.231783686 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2500) (512y:   12) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.572156e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.955274e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.955274e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.539604e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.891988e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.891988e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371885e-02 +- 3.270112e-06 )  GeV^0
-TOTAL       :     3.139046 sec
+TOTAL       :     3.181070 sec
 INFO: No Floating Point Exceptions have been reported
-     7,661,751,104      cycles                           #    2.437 GHz                    
-    12,572,532,310      instructions                     #    1.64  insn per cycle         
-       3.144282241 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1736) (512y:   17) (512z: 1439)
+     7,641,722,393      cycles                           #    2.399 GHz                    
+    12,564,622,024      instructions                     #    1.64  insn per cycle         
+       3.186357864 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1728) (512y:   17) (512z: 1439)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt
index 97c35a5219..88892aa3af 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2024-06-28_20:58:02
+DATE: 2024-08-08_20:25:48
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 1.318335e+08                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.194026e+09                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.156430e+09                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.382651e+08                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.206198e+09                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.156880e+09                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371687e-02 +- 3.270220e-06 )  GeV^0
-TOTAL       :     0.844316 sec
+TOTAL       :     0.845509 sec
 INFO: No Floating Point Exceptions have been reported
-     3,204,326,437      cycles                           #    3.008 GHz                    
-     6,522,971,608      instructions                     #    2.04  insn per cycle         
-       1.121979744 seconds time elapsed
+     3,157,288,524      cycles                           #    2.956 GHz                    
+     6,452,716,967      instructions                     #    2.04  insn per cycle         
+       1.124028974 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 121
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.098591e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.298491e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.298491e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.102313e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.299140e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.299140e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371707e-02 +- 3.270376e-06 )  GeV^0
-TOTAL       :     6.087920 sec
+TOTAL       :     6.067726 sec
 INFO: No Floating Point Exceptions have been reported
-    18,281,240,224      cycles                           #    3.001 GHz                    
-    45,004,706,136      instructions                     #    2.46  insn per cycle         
-       6.093197293 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  424) (avx2:    0) (512y:    0) (512z:    0)
+    18,241,926,835      cycles                           #    3.004 GHz                    
+    44,997,190,895      instructions                     #    2.47  insn per cycle         
+       6.073021817 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  411) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.344868e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.585602e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.585602e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.262484e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.452586e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.452586e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371707e-02 +- 3.270376e-06 )  GeV^0
-TOTAL       :     3.044494 sec
+TOTAL       :     3.153640 sec
 INFO: No Floating Point Exceptions have been reported
-     9,316,186,672      cycles                           #    3.055 GHz                    
-    22,293,378,605      instructions                     #    2.39  insn per cycle         
-       3.049801016 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1969) (avx2:    0) (512y:    0) (512z:    0)
+     9,294,014,762      cycles                           #    2.943 GHz                    
+    22,288,953,735      instructions                     #    2.40  insn per cycle         
+       3.158807454 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1957) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.511279e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.866100e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.866100e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.393307e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.660811e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.660811e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270341e-06 )  GeV^0
-TOTAL       :     2.864030 sec
+TOTAL       :     3.002727 sec
 INFO: No Floating Point Exceptions have been reported
-     8,395,420,887      cycles                           #    2.927 GHz                    
-    15,755,249,273      instructions                     #    1.88  insn per cycle         
-       2.869286108 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2608) (512y:    0) (512z:    0)
+     8,431,789,445      cycles                           #    2.804 GHz                    
+    15,745,619,364      instructions                     #    1.87  insn per cycle         
+       3.007966059 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2595) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.550336e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.966675e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.966675e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.401412e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.704220e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.704220e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270341e-06 )  GeV^0
-TOTAL       :     2.820910 sec
+TOTAL       :     2.993880 sec
 INFO: No Floating Point Exceptions have been reported
-     8,300,103,012      cycles                           #    2.938 GHz                    
-    15,610,356,251      instructions                     #    1.88  insn per cycle         
-       2.826135977 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2516) (512y:   12) (512z:    0)
+     8,307,647,714      cycles                           #    2.771 GHz                    
+    15,598,428,137      instructions                     #    1.88  insn per cycle         
+       2.998876053 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2500) (512y:   12) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.570643e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.939319e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.939319e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.569189e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.940564e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.940564e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270342e-06 )  GeV^0
-TOTAL       :     2.807069 sec
+TOTAL       :     2.807856 sec
 INFO: No Floating Point Exceptions have been reported
-     6,622,447,715      cycles                           #    2.355 GHz                    
-    12,862,667,527      instructions                     #    1.94  insn per cycle         
-       2.812373511 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1736) (512y:   17) (512z: 1439)
+     6,608,078,812      cycles                           #    2.350 GHz                    
+    12,854,592,970      instructions                     #    1.95  insn per cycle         
+       2.812995127 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1728) (512y:   17) (512z: 1439)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt
index ef231365e8..9b85e8bca9 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2024-06-28_20:55:22
+DATE: 2024-08-08_20:23:04
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,22 +50,24 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 9.303769e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.151846e+09                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.015925e+09                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.140303e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.190749e+09                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.050049e+09                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371710e-02 +- 3.270389e-06 )  GeV^0
-TOTAL       :     1.455812 sec
+TOTAL       :     1.475514 sec
 INFO: No Floating Point Exceptions have been reported
-     5,035,703,348      cycles                           #    2.998 GHz                    
-     9,233,292,656      instructions                     #    1.83  insn per cycle         
-       1.735608374 seconds time elapsed
+     5,002,845,340      cycles                           #    2.948 GHz                    
+     9,174,343,943      instructions                     #    1.83  insn per cycle         
+       1.753614320 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 121
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -82,20 +84,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.111353e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.309700e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.309700e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.100425e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.302255e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.302255e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371707e-02 +- 3.270376e-06 )  GeV^0
-TOTAL       :     6.015558 sec
+TOTAL       :     6.083427 sec
 INFO: No Floating Point Exceptions have been reported
-    18,261,729,705      cycles                           #    3.034 GHz                    
-    45,005,382,332      instructions                     #    2.46  insn per cycle         
-       6.020795485 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  424) (avx2:    0) (512y:    0) (512z:    0)
+    18,286,986,421      cycles                           #    3.004 GHz                    
+    44,997,971,916      instructions                     #    2.46  insn per cycle         
+       6.088650881 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  411) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -110,20 +113,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.356374e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.602029e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.602029e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.314534e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.542028e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.542028e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371707e-02 +- 3.270376e-06 )  GeV^0
-TOTAL       :     3.032852 sec
+TOTAL       :     3.081783 sec
 INFO: No Floating Point Exceptions have been reported
-     9,274,537,984      cycles                           #    3.053 GHz                    
-    22,293,508,856      instructions                     #    2.40  insn per cycle         
-       3.038261520 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1969) (avx2:    0) (512y:    0) (512z:    0)
+     9,321,092,178      cycles                           #    3.020 GHz                    
+    22,287,543,522      instructions                     #    2.39  insn per cycle         
+       3.087086590 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1957) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -138,20 +142,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.521232e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.881344e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.881344e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.473883e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.791063e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.791063e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270341e-06 )  GeV^0
-TOTAL       :     2.849181 sec
+TOTAL       :     2.904887 sec
 INFO: No Floating Point Exceptions have been reported
-     8,409,015,977      cycles                           #    2.947 GHz                    
-    15,756,716,712      instructions                     #    1.87  insn per cycle         
-       2.854559012 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2608) (512y:    0) (512z:    0)
+     8,410,533,055      cycles                           #    2.892 GHz                    
+    15,745,298,993      instructions                     #    1.87  insn per cycle         
+       2.910034115 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2595) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -166,20 +171,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.555493e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.967215e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.967215e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.505951e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.882287e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.882287e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270341e-06 )  GeV^0
-TOTAL       :     2.817657 sec
+TOTAL       :     2.874716 sec
 INFO: No Floating Point Exceptions have been reported
-     8,284,280,666      cycles                           #    2.936 GHz                    
-    15,609,086,887      instructions                     #    1.88  insn per cycle         
-       2.822871000 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2516) (512y:   12) (512z:    0)
+     8,289,781,145      cycles                           #    2.880 GHz                    
+    15,603,340,875      instructions                     #    1.88  insn per cycle         
+       2.879926744 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2500) (512y:   12) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -194,20 +200,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.591429e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.994264e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.994264e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.541059e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.907885e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.907885e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270342e-06 )  GeV^0
-TOTAL       :     2.786817 sec
+TOTAL       :     2.838024 sec
 INFO: No Floating Point Exceptions have been reported
-     6,634,065,696      cycles                           #    2.377 GHz                    
-    12,862,560,130      instructions                     #    1.94  insn per cycle         
-       2.792188382 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1736) (512y:   17) (512z: 1439)
+     6,642,493,654      cycles                           #    2.337 GHz                    
+    12,855,006,533      instructions                     #    1.94  insn per cycle         
+       2.843273121 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1728) (512y:   17) (512z: 1439)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt
index b94fdd94e9..1d6c5eac35 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2024-06-28_20:16:09
+DATE: 2024-08-08_19:49:17
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 1.384600e+08                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.250346e+09                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.207785e+09                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.538728e+08                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.270981e+09                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.213583e+09                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371687e-02 +- 3.270220e-06 )  GeV^0
-TOTAL       :     0.570828 sec
+TOTAL       :     0.564431 sec
 INFO: No Floating Point Exceptions have been reported
-     2,356,568,658      cycles                           #    2.967 GHz                    
-     3,666,157,119      instructions                     #    1.56  insn per cycle         
-       0.851031757 seconds time elapsed
+     2,335,295,476      cycles                           #    2.965 GHz                    
+     3,628,047,058      instructions                     #    1.55  insn per cycle         
+       0.844723791 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.109464e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.310084e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.310084e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.105961e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.305064e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.305064e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371707e-02 +- 3.270376e-06 )  GeV^0
-TOTAL       :     6.027157 sec
+TOTAL       :     6.061656 sec
 INFO: No Floating Point Exceptions have been reported
-    18,232,427,029      cycles                           #    3.023 GHz                    
-    44,978,661,516      instructions                     #    2.47  insn per cycle         
-       6.032361273 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  410) (avx2:    0) (512y:    0) (512z:    0)
+    18,285,648,193      cycles                           #    3.014 GHz                    
+    45,012,181,796      instructions                     #    2.46  insn per cycle         
+       6.068344943 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  397) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.329924e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.545800e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.545800e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.291804e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.489005e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.489005e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371707e-02 +- 3.270376e-06 )  GeV^0
-TOTAL       :     3.061730 sec
+TOTAL       :     3.124490 sec
 INFO: No Floating Point Exceptions have been reported
-     9,324,636,350      cycles                           #    3.041 GHz                    
-    22,261,175,312      instructions                     #    2.39  insn per cycle         
-       3.067057970 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1952) (avx2:    0) (512y:    0) (512z:    0)
+     9,410,134,292      cycles                           #    3.006 GHz                    
+    22,303,224,878      instructions                     #    2.37  insn per cycle         
+       3.131481201 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1940) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.485254e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.804597e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.804597e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.475997e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.815316e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.815316e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270341e-06 )  GeV^0
-TOTAL       :     2.895885 sec
+TOTAL       :     2.909295 sec
 INFO: No Floating Point Exceptions have been reported
-     8,404,314,376      cycles                           #    2.898 GHz                    
-    15,749,899,686      instructions                     #    1.87  insn per cycle         
-       2.901068192 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2583) (512y:    0) (512z:    0)
+     8,493,085,415      cycles                           #    2.913 GHz                    
+    15,781,425,735      instructions                     #    1.86  insn per cycle         
+       2.916002973 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2570) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.546580e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.928183e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.928183e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.513335e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.913286e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.913286e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270341e-06 )  GeV^0
-TOTAL       :     2.826440 sec
+TOTAL       :     2.878218 sec
 INFO: No Floating Point Exceptions have been reported
-     8,267,422,079      cycles                           #    2.921 GHz                    
-    15,597,882,535      instructions                     #    1.89  insn per cycle         
-       2.831511381 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2485) (512y:   12) (512z:    0)
+     8,394,171,701      cycles                           #    2.911 GHz                    
+    15,627,283,272      instructions                     #    1.86  insn per cycle         
+       2.884835196 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2469) (512y:   12) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.588361e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.981784e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.981784e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.564665e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.956343e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.956343e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270342e-06 )  GeV^0
-TOTAL       :     2.789975 sec
+TOTAL       :     2.826301 sec
 INFO: No Floating Point Exceptions have been reported
-     6,598,276,864      cycles                           #    2.361 GHz                    
-    12,843,019,056      instructions                     #    1.95  insn per cycle         
-       2.795162036 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1706) (512y:   18) (512z: 1427)
+     6,645,156,055      cycles                           #    2.346 GHz                    
+    12,878,593,303      instructions                     #    1.94  insn per cycle         
+       2.832875887 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1698) (512y:   18) (512z: 1427)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt
index 8910beeb75..2b62892e6a 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2024-06-28_20:40:58
+DATE: 2024-08-08_20:08:10
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 1.302115e+08                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.150174e+09                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.143059e+09                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.451320e+08                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.231819e+09                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.130769e+09                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371687e-02 +- 3.270220e-06 )  GeV^0
-TOTAL       :     0.584501 sec
+TOTAL       :     0.567390 sec
 INFO: No Floating Point Exceptions have been reported
-     2,297,094,729      cycles                           #    2.831 GHz                    
-     3,594,137,904      instructions                     #    1.56  insn per cycle         
-       0.868040702 seconds time elapsed
+     2,325,688,868      cycles                           #    2.936 GHz                    
+     3,579,904,434      instructions                     #    1.54  insn per cycle         
+       0.848470717 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 121
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.706876e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.240008e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.240008e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.665768e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.163815e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.163815e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371707e-02 +- 3.270376e-06 )  GeV^0
-TOTAL       :     4.041260 sec
+TOTAL       :     4.146283 sec
 INFO: No Floating Point Exceptions have been reported
-    12,193,704,812      cycles                           #    3.014 GHz                    
-    32,189,168,438      instructions                     #    2.64  insn per cycle         
-       4.046515911 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  303) (avx2:    0) (512y:    0) (512z:    0)
+    12,236,614,644      cycles                           #    2.947 GHz                    
+    32,269,366,728      instructions                     #    2.64  insn per cycle         
+       4.152494891 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  290) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.791622e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.749217e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.749217e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.716868e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.596230e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.596230e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371707e-02 +- 3.270376e-06 )  GeV^0
-TOTAL       :     2.617891 sec
+TOTAL       :     2.692634 sec
 INFO: No Floating Point Exceptions have been reported
-     7,979,787,357      cycles                           #    3.043 GHz                    
-    18,695,484,383      instructions                     #    2.34  insn per cycle         
-       2.623236074 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1560) (avx2:    0) (512y:    0) (512z:    0)
+     8,040,413,978      cycles                           #    2.980 GHz                    
+    18,731,295,679      instructions                     #    2.33  insn per cycle         
+       2.699009464 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1548) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.915587e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.844943e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.844943e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.823808e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.734147e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.734147e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270341e-06 )  GeV^0
-TOTAL       :     2.513077 sec
+TOTAL       :     2.599488 sec
 INFO: No Floating Point Exceptions have been reported
-     7,423,731,626      cycles                           #    2.949 GHz                    
-    14,245,306,500      instructions                     #    1.92  insn per cycle         
-       2.518170239 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2237) (512y:    0) (512z:    0)
+     7,529,267,846      cycles                           #    2.890 GHz                    
+    14,278,306,013      instructions                     #    1.90  insn per cycle         
+       2.606005161 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2222) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.941748e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.948410e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.948410e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.881055e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.928068e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.928068e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270341e-06 )  GeV^0
-TOTAL       :     2.498266 sec
+TOTAL       :     2.551515 sec
 INFO: No Floating Point Exceptions have been reported
-     7,289,469,121      cycles                           #    2.913 GHz                    
-    13,941,958,463      instructions                     #    1.91  insn per cycle         
-       2.503658148 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2094) (512y:    3) (512z:    0)
+     7,444,338,967      cycles                           #    2.911 GHz                    
+    13,969,219,259      instructions                     #    1.88  insn per cycle         
+       2.557876734 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2074) (512y:    3) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.633828e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.112002e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.112002e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.593244e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.031185e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.031185e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270342e-06 )  GeV^0
-TOTAL       :     2.747861 sec
+TOTAL       :     2.800385 sec
 INFO: No Floating Point Exceptions have been reported
-     6,514,962,868      cycles                           #    2.368 GHz                    
-    13,424,477,335      instructions                     #    2.06  insn per cycle         
-       2.753003173 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2073) (512y:    1) (512z: 1197)
+     6,564,002,113      cycles                           #    2.339 GHz                    
+    13,450,088,279      instructions                     #    2.05  insn per cycle         
+       2.806913095 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2056) (512y:    1) (512z: 1197)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt
index 2b9c5c9dab..5ae8d74446 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2024-06-28_20:41:20
+DATE: 2024-08-08_20:08:33
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 1.300237e+08                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.170808e+09                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.204667e+09                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.456866e+08                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.267705e+09                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.218590e+09                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371687e-02 +- 3.270220e-06 )  GeV^0
-TOTAL       :     0.573714 sec
+TOTAL       :     0.568736 sec
 INFO: No Floating Point Exceptions have been reported
-     2,371,507,340      cycles                           #    2.957 GHz                    
-     3,677,447,642      instructions                     #    1.55  insn per cycle         
-       0.858936652 seconds time elapsed
+     2,333,386,939      cycles                           #    2.946 GHz                    
+     3,651,568,314      instructions                     #    1.56  insn per cycle         
+       0.849375970 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.261649e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.281605e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.281605e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.283106e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.333262e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.333262e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371707e-02 +- 3.270376e-06 )  GeV^0
-TOTAL       :     3.141273 sec
+TOTAL       :     3.121614 sec
 INFO: No Floating Point Exceptions have been reported
-     9,341,904,734      cycles                           #    2.970 GHz                    
-    25,627,115,270      instructions                     #    2.74  insn per cycle         
-       3.146581121 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  256) (avx2:    0) (512y:    0) (512z:    0)
+     9,386,181,268      cycles                           #    3.002 GHz                    
+    25,683,181,247      instructions                     #    2.74  insn per cycle         
+       3.127889698 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  243) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.067172e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.682568e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.682568e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.093996e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.729930e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.729930e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371707e-02 +- 3.270376e-06 )  GeV^0
-TOTAL       :     2.420676 sec
+TOTAL       :     2.404675 sec
 INFO: No Floating Point Exceptions have been reported
-     7,248,612,827      cycles                           #    2.989 GHz                    
-    16,867,376,247      instructions                     #    2.33  insn per cycle         
-       2.425928290 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1362) (avx2:    0) (512y:    0) (512z:    0)
+     7,273,765,849      cycles                           #    3.018 GHz                    
+    16,902,173,009      instructions                     #    2.32  insn per cycle         
+       2.411177480 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1350) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.035973e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.218524e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.218524e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.955814e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.106638e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.106638e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270341e-06 )  GeV^0
-TOTAL       :     2.431224 sec
+TOTAL       :     2.499207 sec
 INFO: No Floating Point Exceptions have been reported
-     7,140,137,122      cycles                           #    2.932 GHz                    
-    13,623,202,012      instructions                     #    1.91  insn per cycle         
-       2.436415043 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2061) (512y:    0) (512z:    0)
+     7,265,897,672      cycles                           #    2.902 GHz                    
+    13,654,744,957      instructions                     #    1.88  insn per cycle         
+       2.505830767 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2046) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.082251e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.384683e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.384683e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.024505e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.340418e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.340418e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270341e-06 )  GeV^0
-TOTAL       :     2.398263 sec
+TOTAL       :     2.448205 sec
 INFO: No Floating Point Exceptions have been reported
-     7,046,497,716      cycles                           #    2.933 GHz                    
-    13,426,599,430      instructions                     #    1.91  insn per cycle         
-       2.403454610 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1947) (512y:    4) (512z:    0)
+     7,137,327,072      cycles                           #    2.909 GHz                    
+    13,455,725,408      instructions                     #    1.89  insn per cycle         
+       2.454335523 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1927) (512y:    4) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.766798e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.419116e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.419116e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.717556e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.328622e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.328622e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270342e-06 )  GeV^0
-TOTAL       :     2.633986 sec
+TOTAL       :     2.693340 sec
 INFO: No Floating Point Exceptions have been reported
-     6,317,197,476      cycles                           #    2.394 GHz                    
-    13,153,165,220      instructions                     #    2.08  insn per cycle         
-       2.639245137 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2029) (512y:    1) (512z: 1083)
+     6,390,724,476      cycles                           #    2.368 GHz                    
+    13,180,968,753      instructions                     #    2.06  insn per cycle         
+       2.699833523 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2012) (512y:    1) (512z: 1083)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
index 5159836f27..dec1886a20 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2024-06-28_20:16:34
+DATE: 2024-08-08_19:49:43
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 7.097692e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.855962e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.153097e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.471546e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.855416e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.166311e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     0.674608 sec
+TOTAL       :     0.664565 sec
 INFO: No Floating Point Exceptions have been reported
-     2,603,717,798      cycles                           #    2.865 GHz                    
-     4,140,407,206      instructions                     #    1.59  insn per cycle         
-       0.968093967 seconds time elapsed
+     2,673,452,306      cycles                           #    2.953 GHz                    
+     4,096,581,433      instructions                     #    1.53  insn per cycle         
+       0.967198892 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.055185e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.225988e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.225988e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.042304e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.212707e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.212707e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     6.362286 sec
+TOTAL       :     6.467559 sec
 INFO: No Floating Point Exceptions have been reported
-    19,341,325,065      cycles                           #    3.038 GHz                    
-    46,294,100,838      instructions                     #    2.39  insn per cycle         
-       6.367819732 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  479) (avx2:    0) (512y:    0) (512z:    0)
+    19,491,750,695      cycles                           #    3.010 GHz                    
+    46,366,168,986      instructions                     #    2.38  insn per cycle         
+       6.476541865 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  466) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.685403e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.221591e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.221591e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.662736e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.194123e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.194123e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     4.135881 sec
+TOTAL       :     4.219503 sec
 INFO: No Floating Point Exceptions have been reported
-    12,592,214,789      cycles                           #    3.041 GHz                    
-    31,477,122,585      instructions                     #    2.50  insn per cycle         
-       4.141519100 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1732) (avx2:    0) (512y:    0) (512z:    0)
+    12,706,673,121      cycles                           #    3.006 GHz                    
+    31,586,088,348      instructions                     #    2.49  insn per cycle         
+       4.228514763 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1720) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.053122e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.867793e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.867793e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.015466e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.812156e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.812156e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.464196 sec
+TOTAL       :     3.548784 sec
 INFO: No Floating Point Exceptions have been reported
-    10,097,562,543      cycles                           #    2.911 GHz                    
-    19,468,852,516      instructions                     #    1.93  insn per cycle         
-       3.469759834 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2133) (512y:    0) (512z:    0)
+    10,222,806,702      cycles                           #    2.874 GHz                    
+    19,575,907,459      instructions                     #    1.91  insn per cycle         
+       3.557713338 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2123) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.083342e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.922426e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.922426e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.051557e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.890469e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.890469e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.419745 sec
+TOTAL       :     3.498884 sec
 INFO: No Floating Point Exceptions have been reported
-     9,936,173,470      cycles                           #    2.902 GHz                    
-    19,218,686,238      instructions                     #    1.93  insn per cycle         
-       3.425268707 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1874) (512y:  189) (512z:    0)
+    10,092,991,859      cycles                           #    2.879 GHz                    
+    19,324,671,897      instructions                     #    1.91  insn per cycle         
+       3.507900575 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1866) (512y:  189) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.875409e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.530800e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.530800e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.882298e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.563573e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.563573e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.761394 sec
+TOTAL       :     3.772337 sec
 INFO: No Floating Point Exceptions have been reported
-     8,401,346,330      cycles                           #    2.231 GHz                    
-    15,063,802,925      instructions                     #    1.79  insn per cycle         
-       3.766578754 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1028) (512y:  154) (512z: 1321)
+     8,566,798,073      cycles                           #    2.266 GHz                    
+    15,161,524,534      instructions                     #    1.77  insn per cycle         
+       3.781171342 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1044) (512y:  154) (512z: 1321)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt
index e166c6fc83..e7689b72e7 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 
-DATE: 2024-06-28_20:17:04
+DATE: 2024-08-08_19:50:13
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 7.133743e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.855252e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.188297e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.539005e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.550707e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.172141e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     0.666632 sec
+TOTAL       :     0.661474 sec
 INFO: No Floating Point Exceptions have been reported
-     2,676,885,399      cycles                           #    2.944 GHz                    
-     4,145,864,141      instructions                     #    1.55  insn per cycle         
-       0.969606465 seconds time elapsed
+     2,649,580,670      cycles                           #    2.965 GHz                    
+     4,041,332,680      instructions                     #    1.53  insn per cycle         
+       0.953046472 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.050080e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.221070e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.221070e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.034608e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.202440e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.202440e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     6.394211 sec
+TOTAL       :     6.513220 sec
 INFO: No Floating Point Exceptions have been reported
-    19,359,472,456      cycles                           #    3.025 GHz                    
-    46,231,043,339      instructions                     #    2.39  insn per cycle         
-       6.399922515 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  466) (avx2:    0) (512y:    0) (512z:    0)
+    19,609,702,737      cycles                           #    3.007 GHz                    
+    46,307,035,647      instructions                     #    2.36  insn per cycle         
+       6.522463944 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  453) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.676916e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.213764e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.213764e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.657659e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.187172e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.187172e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     4.157495 sec
+TOTAL       :     4.231188 sec
 INFO: No Floating Point Exceptions have been reported
-    12,636,464,470      cycles                           #    3.036 GHz                    
-    31,450,700,410      instructions                     #    2.49  insn per cycle         
-       4.162847802 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1724) (avx2:    0) (512y:    0) (512z:    0)
+    12,732,843,853      cycles                           #    3.004 GHz                    
+    31,560,321,434      instructions                     #    2.48  insn per cycle         
+       4.240067788 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1712) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.050588e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.855873e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.855873e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.029457e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.843800e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.843800e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.469434 sec
+TOTAL       :     3.528354 sec
 INFO: No Floating Point Exceptions have been reported
-    10,081,149,851      cycles                           #    2.902 GHz                    
-    19,455,615,991      instructions                     #    1.93  insn per cycle         
-       3.474830271 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2117) (512y:    0) (512z:    0)
+    10,258,124,960      cycles                           #    2.901 GHz                    
+    19,565,249,837      instructions                     #    1.91  insn per cycle         
+       3.537275385 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2107) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.076558e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.911672e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.911672e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.049544e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.886035e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.886035e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.433307 sec
+TOTAL       :     3.497781 sec
 INFO: No Floating Point Exceptions have been reported
-     9,909,907,880      cycles                           #    2.882 GHz                    
-    19,284,186,760      instructions                     #    1.95  insn per cycle         
-       3.438922876 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1868) (512y:  189) (512z:    0)
+    10,124,826,634      cycles                           #    2.887 GHz                    
+    19,390,299,312      instructions                     #    1.92  insn per cycle         
+       3.507669206 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1860) (512y:  189) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.915935e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.596273e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.596273e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.905533e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.593731e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.593731e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
-TOTAL       :     3.685671 sec
+TOTAL       :     3.733502 sec
 INFO: No Floating Point Exceptions have been reported
-     8,279,654,546      cycles                           #    2.244 GHz                    
-    14,978,357,448      instructions                     #    1.81  insn per cycle         
-       3.691061407 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1021) (512y:  156) (512z: 1305)
+     8,422,503,642      cycles                           #    2.251 GHz                    
+    15,074,129,788      instructions                     #    1.79  insn per cycle         
+       3.742530520 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1037) (512y:  156) (512z: 1305)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt
index f2e16bc3a4..34e03e8fe4 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_20:17:33
+DATE: 2024-08-08_19:50:43
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.887654e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.177294e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.279238e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.015578e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.167678e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.279582e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     0.527287 sec
+TOTAL       :     0.520495 sec
 INFO: No Floating Point Exceptions have been reported
-     2,162,264,043      cycles                           #    2.837 GHz                    
-     3,099,754,233      instructions                     #    1.43  insn per cycle         
-       0.818700934 seconds time elapsed
+     2,215,808,169      cycles                           #    2.946 GHz                    
+     3,187,450,258      instructions                     #    1.44  insn per cycle         
+       0.809093508 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.882060e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.931187e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.931187e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.870302e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.920397e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.920397e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     5.678336 sec
+TOTAL       :     5.747684 sec
 INFO: No Floating Point Exceptions have been reported
-    17,196,505,278      cycles                           #    3.026 GHz                    
-    45,941,679,290      instructions                     #    2.67  insn per cycle         
-       5.684096482 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  631) (avx2:    0) (512y:    0) (512z:    0)
+    17,324,193,414      cycles                           #    3.009 GHz                    
+    46,060,464,647      instructions                     #    2.66  insn per cycle         
+       5.757711057 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  618) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.295547e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.463909e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.463909e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.256365e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.416045e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.416045e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     3.291073 sec
+TOTAL       :     3.359278 sec
 INFO: No Floating Point Exceptions have been reported
-    10,024,978,903      cycles                           #    3.043 GHz                    
-    27,842,089,342      instructions                     #    2.78  insn per cycle         
-       3.296637646 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2546) (avx2:    0) (512y:    0) (512z:    0)
+    10,153,117,527      cycles                           #    3.015 GHz                    
+    27,956,665,962      instructions                     #    2.75  insn per cycle         
+       3.369058986 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2534) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.233679e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.639497e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.639497e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.128206e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.537547e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.537547e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.110457 sec
+TOTAL       :     2.182924 sec
 INFO: No Floating Point Exceptions have been reported
-     6,078,030,978      cycles                           #    2.873 GHz                    
-    12,585,249,314      instructions                     #    2.07  insn per cycle         
-       2.116164658 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2695) (512y:    0) (512z:    0)
+     6,226,289,605      cycles                           #    2.841 GHz                    
+    12,698,897,797      instructions                     #    2.04  insn per cycle         
+       2.192278719 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2612) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.733341e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.227942e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.227942e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.605220e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.105851e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.105851e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     1.936443 sec
+TOTAL       :     2.009834 sec
 INFO: No Floating Point Exceptions have been reported
-     5,569,510,614      cycles                           #    2.869 GHz                    
-    12,022,484,967      instructions                     #    2.16  insn per cycle         
-       1.942002573 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2438) (512y:  144) (512z:    0)
+     5,688,710,640      cycles                           #    2.818 GHz                    
+    12,134,437,252      instructions                     #    2.13  insn per cycle         
+       2.019506075 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2350) (512y:  144) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.719724e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.922615e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.922615e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.669310e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.868262e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.868262e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.926284 sec
+TOTAL       :     2.997018 sec
 INFO: No Floating Point Exceptions have been reported
-     5,708,872,761      cycles                           #    1.948 GHz                    
-     8,296,017,452      instructions                     #    1.45  insn per cycle         
-       2.931816776 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1459) (512y:  122) (512z: 1801)
+     5,821,558,239      cycles                           #    1.938 GHz                    
+     8,411,130,761      instructions                     #    1.44  insn per cycle         
+       3.006784964 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1429) (512y:  122) (512z: 1801)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt
index 2020a39d40..20904d51fd 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_20:50:20
+DATE: 2024-08-08_20:17:37
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.667501e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.207084e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.207084e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.670983e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.294260e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.294260e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     0.803011 sec
+TOTAL       :     0.801021 sec
 INFO: No Floating Point Exceptions have been reported
-     3,110,018,268      cycles                           #    2.979 GHz                    
-     4,839,889,726      instructions                     #    1.56  insn per cycle         
-       1.102667488 seconds time elapsed
+     3,080,158,706      cycles                           #    2.935 GHz                    
+     4,797,683,266      instructions                     #    1.56  insn per cycle         
+       1.107754362 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -70,8 +70,10 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -89,20 +91,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.881108e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.929315e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.929315e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.860613e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.909257e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.909257e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     5.762355 sec
+TOTAL       :     5.862697 sec
 INFO: No Floating Point Exceptions have been reported
-    17,562,906,762      cycles                           #    3.045 GHz                    
-    46,001,411,072      instructions                     #    2.62  insn per cycle         
-       5.769657879 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  631) (avx2:    0) (512y:    0) (512z:    0)
+    17,649,346,443      cycles                           #    3.005 GHz                    
+    46,130,000,854      instructions                     #    2.61  insn per cycle         
+       5.874952134 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  618) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -118,20 +121,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.229172e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.389158e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.389158e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.216658e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.372905e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.372905e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     3.435776 sec
+TOTAL       :     3.488934 sec
 INFO: No Floating Point Exceptions have been reported
-    10,363,946,589      cycles                           #    3.011 GHz                    
-    28,026,973,873      instructions                     #    2.70  insn per cycle         
-       3.442976592 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2546) (avx2:    0) (512y:    0) (512z:    0)
+    10,528,637,782      cycles                           #    3.008 GHz                    
+    28,161,635,226      instructions                     #    2.67  insn per cycle         
+       3.501603953 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2534) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -147,20 +151,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.097098e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.485023e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.485023e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.020861e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.404928e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.404928e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.247172 sec
+TOTAL       :     2.319862 sec
 INFO: No Floating Point Exceptions have been reported
-     6,480,280,232      cycles                           #    2.876 GHz                    
-    12,872,958,029      instructions                     #    1.99  insn per cycle         
-       2.254448726 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2695) (512y:    0) (512z:    0)
+     6,615,013,287      cycles                           #    2.835 GHz                    
+    13,014,509,842      instructions                     #    1.97  insn per cycle         
+       2.334044597 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2612) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -176,20 +181,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.611591e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.073609e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.073609e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.540790e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.009639e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.009639e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.055493 sec
+TOTAL       :     2.122719 sec
 INFO: No Floating Point Exceptions have been reported
-     5,963,252,373      cycles                           #    2.892 GHz                    
-    12,307,269,566      instructions                     #    2.06  insn per cycle         
-       2.062562699 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2438) (512y:  144) (512z:    0)
+     6,074,435,637      cycles                           #    2.845 GHz                    
+    12,446,562,239      instructions                     #    2.05  insn per cycle         
+       2.135603783 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2350) (512y:  144) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -205,20 +211,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.494504e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.674936e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.674936e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.615591e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.807268e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.807268e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     3.197110 sec
+TOTAL       :     3.133757 sec
 INFO: No Floating Point Exceptions have been reported
-     6,101,726,119      cycles                           #    1.909 GHz                    
-     8,542,052,342      instructions                     #    1.40  insn per cycle         
-       3.204459961 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1459) (512y:  122) (512z: 1801)
+     6,213,946,932      cycles                           #    1.975 GHz                    
+     8,678,322,888      instructions                     #    1.40  insn per cycle         
+       3.146596624 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1429) (512y:  122) (512z: 1801)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt
index 786b897430..278ba4b157 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_21:01:12
+DATE: 2024-08-08_20:29:01
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.755803e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.168375e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.278071e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.861886e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.169373e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.276724e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.079401e+00 +- 3.402993e-03 )  GeV^0
-TOTAL       :     0.616757 sec
+TOTAL       :     0.622862 sec
 INFO: No Floating Point Exceptions have been reported
-     2,520,167,761      cycles                           #    2.979 GHz                    
-     3,681,164,233      instructions                     #    1.46  insn per cycle         
-       0.903907581 seconds time elapsed
+     2,496,588,832      cycles                           #    2.937 GHz                    
+     3,616,944,645      instructions                     #    1.45  insn per cycle         
+       0.908999824 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.896601e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.946013e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.946013e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.858770e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.906877e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.906877e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.079401e+00 +- 3.402993e-03 )  GeV^0
-TOTAL       :     5.694069 sec
+TOTAL       :     5.824941 sec
 INFO: No Floating Point Exceptions have been reported
-    17,355,363,175      cycles                           #    3.046 GHz                    
-    45,958,013,397      instructions                     #    2.65  insn per cycle         
-       5.699661141 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  631) (avx2:    0) (512y:    0) (512z:    0)
+    17,438,858,484      cycles                           #    2.991 GHz                    
+    46,011,567,715      instructions                     #    2.64  insn per cycle         
+       5.831016559 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  618) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.289445e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.461820e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.461820e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.238383e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.396939e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.396939e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.079401e+00 +- 3.402993e-03 )  GeV^0
-TOTAL       :     3.358370 sec
+TOTAL       :     3.423850 sec
 INFO: No Floating Point Exceptions have been reported
-    10,180,396,568      cycles                           #    3.027 GHz                    
-    27,841,400,891      instructions                     #    2.73  insn per cycle         
-       3.363996103 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2546) (avx2:    0) (512y:    0) (512z:    0)
+    10,272,842,406      cycles                           #    2.996 GHz                    
+    27,901,302,334      instructions                     #    2.72  insn per cycle         
+       3.429671541 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2534) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.198567e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.612974e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.612974e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.121821e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.516246e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.516246e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.079401e+00 +- 3.402993e-03 )  GeV^0
-TOTAL       :     2.185592 sec
+TOTAL       :     2.235299 sec
 INFO: No Floating Point Exceptions have been reported
-     6,286,458,818      cycles                           #    2.870 GHz                    
-    12,568,024,915      instructions                     #    2.00  insn per cycle         
-       2.191070415 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2695) (512y:    0) (512z:    0)
+     6,354,923,604      cycles                           #    2.835 GHz                    
+    12,634,246,195      instructions                     #    1.99  insn per cycle         
+       2.242096681 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2612) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.698104e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.182473e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.182473e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.585808e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.053603e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.053603e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.079401e+00 +- 3.402993e-03 )  GeV^0
-TOTAL       :     2.010132 sec
+TOTAL       :     2.059756 sec
 INFO: No Floating Point Exceptions have been reported
-     5,785,299,045      cycles                           #    2.872 GHz                    
-    11,972,108,812      instructions                     #    2.07  insn per cycle         
-       2.015720254 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2438) (512y:  144) (512z:    0)
+     5,815,690,450      cycles                           #    2.817 GHz                    
+    12,015,299,257      instructions                     #    2.07  insn per cycle         
+       2.065558377 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2350) (512y:  144) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.726032e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.924861e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.924861e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.643854e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.839235e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.839235e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.079401e+00 +- 3.402993e-03 )  GeV^0
-TOTAL       :     2.982171 sec
+TOTAL       :     3.061355 sec
 INFO: No Floating Point Exceptions have been reported
-     5,927,841,436      cycles                           #    1.985 GHz                    
-     8,247,377,383      instructions                     #    1.39  insn per cycle         
-       2.987679420 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1459) (512y:  122) (512z: 1801)
+     5,933,052,882      cycles                           #    1.935 GHz                    
+     8,290,148,322      instructions                     #    1.40  insn per cycle         
+       3.067159573 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1429) (512y:  122) (512z: 1801)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt
index 5f6d720529..fba3b57280 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_20:58:28
+DATE: 2024-08-08_20:26:14
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.724595e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.166575e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.277602e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.905617e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.179466e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.279851e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     0.558679 sec
+TOTAL       :     0.555994 sec
 INFO: No Floating Point Exceptions have been reported
-     2,338,202,954      cycles                           #    2.969 GHz                    
-     3,647,650,532      instructions                     #    1.56  insn per cycle         
-       0.844681870 seconds time elapsed
+     2,284,248,162      cycles                           #    2.910 GHz                    
+     3,522,733,929      instructions                     #    1.54  insn per cycle         
+       0.842109172 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.890469e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.939333e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.939333e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.864505e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.911828e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.911828e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     5.653742 sec
+TOTAL       :     5.728269 sec
 INFO: No Floating Point Exceptions have been reported
-    17,201,984,378      cycles                           #    3.040 GHz                    
-    45,941,543,910      instructions                     #    2.67  insn per cycle         
-       5.659339476 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  631) (avx2:    0) (512y:    0) (512z:    0)
+    17,201,286,704      cycles                           #    3.001 GHz                    
+    45,937,216,481      instructions                     #    2.67  insn per cycle         
+       5.733811627 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  618) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.308666e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.474491e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.474491e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.250062e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.410672e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.410672e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     3.276689 sec
+TOTAL       :     3.334295 sec
 INFO: No Floating Point Exceptions have been reported
-    10,031,778,452      cycles                           #    3.057 GHz                    
-    27,843,325,740      instructions                     #    2.78  insn per cycle         
-       3.282220552 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2546) (avx2:    0) (512y:    0) (512z:    0)
+    10,038,224,892      cycles                           #    3.006 GHz                    
+    27,841,209,673      instructions                     #    2.77  insn per cycle         
+       3.340129450 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2534) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.201736e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.604981e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.604981e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.145160e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.541205e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.541205e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.124462 sec
+TOTAL       :     2.147149 sec
 INFO: No Floating Point Exceptions have been reported
-     6,110,813,008      cycles                           #    2.870 GHz                    
-    12,584,901,063      instructions                     #    2.06  insn per cycle         
-       2.130089416 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2695) (512y:    0) (512z:    0)
+     6,102,474,947      cycles                           #    2.835 GHz                    
+    12,591,341,324      instructions                     #    2.06  insn per cycle         
+       2.153315340 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2612) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.746310e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.244054e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.244054e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.639021e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.126234e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.126234e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     1.934375 sec
+TOTAL       :     1.968961 sec
 INFO: No Floating Point Exceptions have been reported
-     5,581,726,506      cycles                           #    2.878 GHz                    
-    12,022,590,996      instructions                     #    2.15  insn per cycle         
-       1.940001347 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2438) (512y:  144) (512z:    0)
+     5,608,749,777      cycles                           #    2.841 GHz                    
+    12,024,185,128      instructions                     #    2.14  insn per cycle         
+       1.975078079 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2350) (512y:  144) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.758674e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.959974e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.959974e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.641587e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.834103e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.834103e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.897270 sec
+TOTAL       :     2.988248 sec
 INFO: No Floating Point Exceptions have been reported
-     5,727,759,334      cycles                           #    1.974 GHz                    
-     8,296,479,561      instructions                     #    1.45  insn per cycle         
-       2.902844422 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1459) (512y:  122) (512z: 1801)
+     5,720,578,029      cycles                           #    1.911 GHz                    
+     8,299,459,915      instructions                     #    1.45  insn per cycle         
+       2.994289958 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1429) (512y:  122) (512z: 1801)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt
index f356364f29..9e3fe4acb0 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_20:55:48
+DATE: 2024-08-08_20:23:31
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,22 +50,24 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 5.908224e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.168760e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.278988e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.032256e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.173338e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.277454e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     0.708513 sec
+TOTAL       :     0.705014 sec
 INFO: No Floating Point Exceptions have been reported
-     2,788,384,939      cycles                           #    2.977 GHz                    
-     4,369,585,987      instructions                     #    1.57  insn per cycle         
-       0.994652751 seconds time elapsed
+     2,749,776,676      cycles                           #    2.945 GHz                    
+     4,325,337,591      instructions                     #    1.57  insn per cycle         
+       0.991327218 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -82,20 +84,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.898323e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.947064e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.947064e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.868158e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.916528e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.916528e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     5.630810 sec
+TOTAL       :     5.717662 sec
 INFO: No Floating Point Exceptions have been reported
-    17,199,246,494      cycles                           #    3.053 GHz                    
-    45,942,002,374      instructions                     #    2.67  insn per cycle         
-       5.636616917 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  631) (avx2:    0) (512y:    0) (512z:    0)
+    17,178,289,091      cycles                           #    3.002 GHz                    
+    45,937,241,973      instructions                     #    2.67  insn per cycle         
+       5.723215350 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  618) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -110,20 +113,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.281603e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.445852e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.445852e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.231136e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.391441e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.391441e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     3.304211 sec
+TOTAL       :     3.354044 sec
 INFO: No Floating Point Exceptions have been reported
-    10,005,316,559      cycles                           #    3.024 GHz                    
-    27,841,805,216      instructions                     #    2.78  insn per cycle         
-       3.310216435 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2546) (avx2:    0) (512y:    0) (512z:    0)
+    10,031,479,526      cycles                           #    2.986 GHz                    
+    27,844,808,096      instructions                     #    2.78  insn per cycle         
+       3.359952965 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2534) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -138,20 +142,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.157548e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.551032e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.551032e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.099162e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.490827e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.490827e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.141315 sec
+TOTAL       :     2.161987 sec
 INFO: No Floating Point Exceptions have been reported
-     6,097,247,438      cycles                           #    2.841 GHz                    
-    12,585,204,824      instructions                     #    2.06  insn per cycle         
-       2.147048600 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2695) (512y:    0) (512z:    0)
+     6,083,392,852      cycles                           #    2.808 GHz                    
+    12,576,453,088      instructions                     #    2.07  insn per cycle         
+       2.167500908 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2612) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -166,20 +171,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.738562e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.232955e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.232955e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.632481e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.118699e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.118699e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     1.934942 sec
+TOTAL       :     1.966228 sec
 INFO: No Floating Point Exceptions have been reported
-     5,595,518,172      cycles                           #    2.884 GHz                    
-    12,020,747,019      instructions                     #    2.15  insn per cycle         
-       1.940748260 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2438) (512y:  144) (512z:    0)
+     5,587,261,117      cycles                           #    2.835 GHz                    
+    12,016,452,187      instructions                     #    2.15  insn per cycle         
+       1.971550633 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2350) (512y:  144) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -194,20 +200,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.575127e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.766861e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.766861e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.687020e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.882322e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.882322e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     3.043164 sec
+TOTAL       :     2.948998 sec
 INFO: No Floating Point Exceptions have been reported
-     5,727,292,763      cycles                           #    1.880 GHz                    
-     8,298,018,766      instructions                     #    1.45  insn per cycle         
-       3.048744350 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1459) (512y:  122) (512z: 1801)
+     5,710,948,756      cycles                           #    1.934 GHz                    
+     8,289,147,048      instructions                     #    1.45  insn per cycle         
+       2.954636423 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1429) (512y:  122) (512z: 1801)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt
index 886aa2766b..dd8639d462 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_20:17:58
+DATE: 2024-08-08_19:51:07
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.892000e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.175238e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.276354e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.953365e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.169057e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.275879e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     0.517809 sec
+TOTAL       :     0.516826 sec
 INFO: No Floating Point Exceptions have been reported
-     2,225,604,920      cycles                           #    2.968 GHz                    
-     3,221,516,712      instructions                     #    1.45  insn per cycle         
-       0.807006941 seconds time elapsed
+     2,205,203,774      cycles                           #    2.951 GHz                    
+     3,179,876,331      instructions                     #    1.44  insn per cycle         
+       0.803907668 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.950456e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.002156e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.002156e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.926342e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.977633e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.977633e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     5.482127 sec
+TOTAL       :     5.581069 sec
 INFO: No Floating Point Exceptions have been reported
-    16,704,752,814      cycles                           #    3.045 GHz                    
-    44,935,943,922      instructions                     #    2.69  insn per cycle         
-       5.487861081 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  581) (avx2:    0) (512y:    0) (512z:    0)
+    16,849,073,106      cycles                           #    3.014 GHz                    
+    45,045,731,432      instructions                     #    2.67  insn per cycle         
+       5.590685845 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  568) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.473548e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.656412e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.656412e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.423058e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.602908e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.602908e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     3.126169 sec
+TOTAL       :     3.201422 sec
 INFO: No Floating Point Exceptions have been reported
-     9,527,553,742      cycles                           #    3.043 GHz                    
-    26,700,808,818      instructions                     #    2.80  insn per cycle         
-       3.131739636 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2343) (avx2:    0) (512y:    0) (512z:    0)
+     9,674,035,774      cycles                           #    3.013 GHz                    
+    26,815,165,030      instructions                     #    2.77  insn per cycle         
+       3.211231348 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2331) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.762240e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.097629e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.097629e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.649217e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.990962e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.990962e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.308155 sec
+TOTAL       :     2.396466 sec
 INFO: No Floating Point Exceptions have been reported
-     6,597,424,472      cycles                           #    2.852 GHz                    
-    14,122,933,490      instructions                     #    2.14  insn per cycle         
-       2.313712672 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2786) (512y:    0) (512z:    0)
+     6,732,899,102      cycles                           #    2.799 GHz                    
+    14,237,973,279      instructions                     #    2.11  insn per cycle         
+       2.406196706 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2703) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.975505e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.343599e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.343599e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.923382e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.291610e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.291610e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.214876 sec
+TOTAL       :     2.269821 sec
 INFO: No Floating Point Exceptions have been reported
-     6,337,916,387      cycles                           #    2.856 GHz                    
-    13,710,130,386      instructions                     #    2.16  insn per cycle         
-       2.220426524 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2437) (512y:  297) (512z:    0)
+     6,473,185,925      cycles                           #    2.841 GHz                    
+    13,823,290,533      instructions                     #    2.14  insn per cycle         
+       2.279550700 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2349) (512y:  297) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.536585e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.718678e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.718678e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.570682e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.758312e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.758312e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     3.073115 sec
+TOTAL       :     3.077054 sec
 INFO: No Floating Point Exceptions have been reported
-     5,918,243,053      cycles                           #    1.923 GHz                    
-    10,064,016,017      instructions                     #    1.70  insn per cycle         
-       3.078700521 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1291) (512y:  208) (512z: 1987)
+     6,015,923,061      cycles                           #    1.950 GHz                    
+    10,176,638,000      instructions                     #    1.69  insn per cycle         
+       3.086647254 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1261) (512y:  208) (512z: 1987)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt
index 1f5555d1f3..1d562b1c51 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_20:41:41
+DATE: 2024-08-08_20:08:54
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.588122e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.164655e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.281010e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.079454e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.184027e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.281167e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     0.525240 sec
+TOTAL       :     0.525003 sec
 INFO: No Floating Point Exceptions have been reported
-     2,248,450,640      cycles                           #    2.948 GHz                    
-     3,243,362,318      instructions                     #    1.44  insn per cycle         
-       0.819380731 seconds time elapsed
+     2,200,806,347      cycles                           #    2.912 GHz                    
+     3,172,188,132      instructions                     #    1.44  insn per cycle         
+       0.814200484 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.496274e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.581602e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.581602e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.477886e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.565553e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.565553e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     4.308010 sec
+TOTAL       :     4.370943 sec
 INFO: No Floating Point Exceptions have been reported
-    13,011,911,332      cycles                           #    3.017 GHz                    
-    34,349,737,743      instructions                     #    2.64  insn per cycle         
-       4.313568297 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  678) (avx2:    0) (512y:    0) (512z:    0)
+    13,117,582,836      cycles                           #    2.995 GHz                    
+    34,450,679,536      instructions                     #    2.63  insn per cycle         
+       4.380756610 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  665) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.074252e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.216149e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.216149e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.033084e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.174712e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.174712e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     3.518078 sec
+TOTAL       :     3.593818 sec
 INFO: No Floating Point Exceptions have been reported
-    10,696,750,210      cycles                           #    3.037 GHz                    
-    24,006,049,306      instructions                     #    2.24  insn per cycle         
-       3.523505209 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2581) (avx2:    0) (512y:    0) (512z:    0)
+    10,811,449,443      cycles                           #    3.001 GHz                    
+    24,123,594,949      instructions                     #    2.23  insn per cycle         
+       3.603506153 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2571) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.789528e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.126618e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.126618e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.731678e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.069353e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.069353e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.295706 sec
+TOTAL       :     2.354482 sec
 INFO: No Floating Point Exceptions have been reported
-     6,575,533,125      cycles                           #    2.859 GHz                    
-    12,347,200,737      instructions                     #    1.88  insn per cycle         
-       2.301423963 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3121) (512y:    0) (512z:    0)
+     6,707,294,523      cycles                           #    2.838 GHz                    
+    12,465,505,098      instructions                     #    1.86  insn per cycle         
+       2.364349203 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3096) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.011477e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.379769e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.379769e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.061977e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.447561e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.447561e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.199416 sec
+TOTAL       :     2.207748 sec
 INFO: No Floating Point Exceptions have been reported
-     6,148,102,598      cycles                           #    2.790 GHz                    
-    11,570,413,212      instructions                     #    1.88  insn per cycle         
-       2.204830710 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2671) (512y:  239) (512z:    0)
+     6,305,288,080      cycles                           #    2.845 GHz                    
+    11,685,678,996      instructions                     #    1.85  insn per cycle         
+       2.217142463 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2640) (512y:  239) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.881899e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.105309e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.105309e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.929117e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.157594e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.157594e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.810767 sec
+TOTAL       :     2.806226 sec
 INFO: No Floating Point Exceptions have been reported
-     5,374,795,520      cycles                           #    1.909 GHz                    
-     9,285,133,018      instructions                     #    1.73  insn per cycle         
-       2.816377054 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2114) (512y:  282) (512z: 1954)
+     5,500,190,609      cycles                           #    1.954 GHz                    
+     9,401,836,893      instructions                     #    1.71  insn per cycle         
+       2.816415768 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2084) (512y:  282) (512z: 1954)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt
index 6c085aaca8..65dd600686 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_20:42:05
+DATE: 2024-08-08_20:09:18
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.572873e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.158466e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.276623e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.067308e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.179547e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.276758e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     0.522810 sec
+TOTAL       :     0.523442 sec
 INFO: No Floating Point Exceptions have been reported
-     2,250,104,415      cycles                           #    2.965 GHz                    
-     3,184,818,143      instructions                     #    1.42  insn per cycle         
-       0.815485646 seconds time elapsed
+     2,203,163,418      cycles                           #    2.923 GHz                    
+     3,173,114,436      instructions                     #    1.44  insn per cycle         
+       0.812619708 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.660610e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.760897e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.760897e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.597347e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.694908e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.694908e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     4.049986 sec
+TOTAL       :     4.173436 sec
 INFO: No Floating Point Exceptions have been reported
-    12,325,342,169      cycles                           #    3.040 GHz                    
-    34,919,769,447      instructions                     #    2.83  insn per cycle         
-       4.055421218 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  443) (avx2:    0) (512y:    0) (512z:    0)
+    12,532,788,513      cycles                           #    2.997 GHz                    
+    35,033,869,738      instructions                     #    2.80  insn per cycle         
+       4.183331959 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  430) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.076627e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.218560e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.218560e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.046469e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.187931e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.187931e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     3.515029 sec
+TOTAL       :     3.579716 sec
 INFO: No Floating Point Exceptions have been reported
-    10,701,072,922      cycles                           #    3.040 GHz                    
-    23,007,343,701      instructions                     #    2.15  insn per cycle         
-       3.520527508 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2349) (avx2:    0) (512y:    0) (512z:    0)
+    10,790,492,364      cycles                           #    3.007 GHz                    
+    23,124,229,685      instructions                     #    2.14  insn per cycle         
+       3.589416563 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2339) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.085752e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.472257e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.472257e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.059739e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.450926e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.450926e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.171429 sec
+TOTAL       :     2.211695 sec
 INFO: No Floating Point Exceptions have been reported
-     6,197,707,325      cycles                           #    2.848 GHz                    
-    11,955,784,112      instructions                     #    1.93  insn per cycle         
-       2.176803625 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2509) (512y:    0) (512z:    0)
+     6,295,892,975      cycles                           #    2.836 GHz                    
+    12,072,618,893      instructions                     #    1.92  insn per cycle         
+       2.220989978 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2484) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.236227e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.640130e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.640130e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.997474e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.374849e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.374849e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.108665 sec
+TOTAL       :     2.235122 sec
 INFO: No Floating Point Exceptions have been reported
-     6,042,683,756      cycles                           #    2.860 GHz                    
-    11,130,717,499      instructions                     #    1.84  insn per cycle         
-       2.114024511 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2126) (512y:  174) (512z:    0)
+     6,279,000,139      cycles                           #    2.798 GHz                    
+    11,243,252,484      instructions                     #    1.79  insn per cycle         
+       2.244690704 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2095) (512y:  174) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.112043e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.358996e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.358996e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.095312e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.342354e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.342354e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.657682 sec
+TOTAL       :     2.697672 sec
 INFO: No Floating Point Exceptions have been reported
-     5,213,529,009      cycles                           #    1.959 GHz                    
-     9,023,567,185      instructions                     #    1.73  insn per cycle         
-       2.663130028 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1650) (512y:  208) (512z: 1570)
+     5,310,077,423      cycles                           #    1.962 GHz                    
+     9,140,837,043      instructions                     #    1.72  insn per cycle         
+       2.707468994 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1620) (512y:  208) (512z: 1570)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
index 3c812282d1..38766f6059 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_20:18:22
+DATE: 2024-08-08_19:51:32
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 8.971515e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.203043e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.398328e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.614637e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.196490e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.391083e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086719e+00 +- 3.413389e-03 )  GeV^0
-TOTAL       :     0.481274 sec
+TOTAL       :     0.477185 sec
 INFO: No Floating Point Exceptions have been reported
-     2,080,512,591      cycles                           #    2.958 GHz                    
-     2,994,385,775      instructions                     #    1.44  insn per cycle         
-       0.762196231 seconds time elapsed
+     2,083,240,592      cycles                           #    2.927 GHz                    
+     2,954,253,066      instructions                     #    1.42  insn per cycle         
+       0.768394565 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 149
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.000726e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.056957e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.056957e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.972261e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.028190e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.028190e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086780e+00 +- 3.413794e-03 )  GeV^0
-TOTAL       :     5.325245 sec
+TOTAL       :     5.413447 sec
 INFO: No Floating Point Exceptions have been reported
-    16,233,475,504      cycles                           #    3.046 GHz                    
-    45,338,704,501      instructions                     #    2.79  insn per cycle         
-       5.330482823 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  605) (avx2:    0) (512y:    0) (512z:    0)
+    16,298,510,952      cycles                           #    3.008 GHz                    
+    45,383,093,310      instructions                     #    2.78  insn per cycle         
+       5.420499578 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  592) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.688881e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.042825e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.042825e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.516274e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.853993e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.853993e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086779e+00 +- 3.413793e-03 )  GeV^0
-TOTAL       :     2.322133 sec
+TOTAL       :     2.420950 sec
 INFO: No Floating Point Exceptions have been reported
-     7,081,033,131      cycles                           #    3.044 GHz                    
-    17,775,454,632      instructions                     #    2.51  insn per cycle         
-       2.327454749 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3151) (avx2:    0) (512y:    0) (512z:    0)
+     7,111,183,634      cycles                           #    2.930 GHz                    
+    17,819,948,567      instructions                     #    2.51  insn per cycle         
+       2.427658659 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3133) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.187763e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.285010e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.285010e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.607320e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.824778e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.824778e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.367437 sec
+TOTAL       :     1.317016 sec
 INFO: No Floating Point Exceptions have been reported
-     3,744,621,805      cycles                           #    2.730 GHz                    
-     8,265,608,992      instructions                     #    2.21  insn per cycle         
-       1.372764201 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3374) (512y:    0) (512z:    0)
+     3,802,543,905      cycles                           #    2.874 GHz                    
+     8,308,913,768      instructions                     #    2.19  insn per cycle         
+       1.323729586 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3350) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.188890e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.051258e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.051258e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.087676e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.047463e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.047463e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.225759 sec
+TOTAL       :     1.251137 sec
 INFO: No Floating Point Exceptions have been reported
-     3,547,894,530      cycles                           #    2.883 GHz                    
-     7,919,949,757      instructions                     #    2.23  insn per cycle         
-       1.231119552 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3226) (512y:   20) (512z:    0)
+     3,608,199,910      cycles                           #    2.871 GHz                    
+     7,963,896,839      instructions                     #    2.21  insn per cycle         
+       1.257792419 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3196) (512y:   20) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 6.887488e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.593324e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.593324e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.851468e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.561768e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.561768e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.606939 sec
+TOTAL       :     1.629029 sec
 INFO: No Floating Point Exceptions have been reported
-     3,254,587,487      cycles                           #    2.020 GHz                    
-     6,098,819,103      instructions                     #    1.87  insn per cycle         
-       1.612213485 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2400) (512y:   24) (512z: 2152)
+     3,306,960,550      cycles                           #    2.023 GHz                    
+     6,143,321,587      instructions                     #    1.86  insn per cycle         
+       1.635836688 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2287) (512y:   24) (512z: 2153)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt
index 8ffba60f10..87c93d2ebd 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_20:50:45
+DATE: 2024-08-08_20:18:02
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 7.207285e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.559811e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.559811e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.181597e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.725510e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.725510e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086805e+00 +- 3.414078e-03 )  GeV^0
-TOTAL       :     0.666449 sec
+TOTAL       :     0.672294 sec
 INFO: No Floating Point Exceptions have been reported
-     2,675,605,031      cycles                           #    2.990 GHz                    
-     4,135,337,657      instructions                     #    1.55  insn per cycle         
-       0.951611952 seconds time elapsed
+     2,617,099,456      cycles                           #    2.904 GHz                    
+     4,062,920,786      instructions                     #    1.55  insn per cycle         
+       0.957784001 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -70,8 +70,10 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 149
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -89,20 +91,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.790072e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.842494e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.842494e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.956957e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.011198e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.011198e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086780e+00 +- 3.413794e-03 )  GeV^0
-TOTAL       :     6.016259 sec
+TOTAL       :     5.484325 sec
 INFO: No Floating Point Exceptions have been reported
-    16,448,241,050      cycles                           #    2.861 GHz                    
-    45,389,130,693      instructions                     #    2.76  insn per cycle         
-       6.023243626 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  605) (avx2:    0) (512y:    0) (512z:    0)
+    16,490,289,692      cycles                           #    3.004 GHz                    
+    45,381,699,221      instructions                     #    2.75  insn per cycle         
+       5.490323533 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  592) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -118,20 +121,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.342252e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.671673e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.671673e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.582859e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.920444e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.920444e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086779e+00 +- 3.413793e-03 )  GeV^0
-TOTAL       :     2.552581 sec
+TOTAL       :     2.418229 sec
 INFO: No Floating Point Exceptions have been reported
-     7,264,559,534      cycles                           #    2.868 GHz                    
-    18,055,626,229      instructions                     #    2.49  insn per cycle         
-       2.558761668 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3151) (avx2:    0) (512y:    0) (512z:    0)
+     7,267,277,115      cycles                           #    2.998 GHz                    
+    18,050,295,436      instructions                     #    2.48  insn per cycle         
+       2.424701000 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3133) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -147,20 +151,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.885868e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.955691e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.955691e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.393268e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.547596e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.547596e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.482805 sec
+TOTAL       :     1.379855 sec
 INFO: No Floating Point Exceptions have been reported
-     3,959,135,121      cycles                           #    2.691 GHz                    
-     8,502,209,023      instructions                     #    2.15  insn per cycle         
-       1.491550395 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3374) (512y:    0) (512z:    0)
+     3,938,588,665      cycles                           #    2.843 GHz                    
+     8,495,556,645      instructions                     #    2.16  insn per cycle         
+       1.386260790 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3350) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -176,20 +181,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.125003e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.251893e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.251893e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.873570e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.014552e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.014552e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.434348 sec
+TOTAL       :     1.313964 sec
 INFO: No Floating Point Exceptions have been reported
-     3,772,960,788      cycles                           #    2.685 GHz                    
-     8,159,229,827      instructions                     #    2.16  insn per cycle         
-       1.441299263 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3226) (512y:   20) (512z:    0)
+     3,770,505,615      cycles                           #    2.857 GHz                    
+     8,157,653,367      instructions                     #    2.16  insn per cycle         
+       1.320625840 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3196) (512y:   20) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -205,20 +211,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.909757e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.503087e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.503087e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.668614e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.340392e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.340392e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.941680 sec
+TOTAL       :     1.706792 sec
 INFO: No Floating Point Exceptions have been reported
-     3,466,655,106      cycles                           #    1.848 GHz                    
-     6,354,787,813      instructions                     #    1.83  insn per cycle         
-       1.949951282 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2400) (512y:   24) (512z: 2152)
+     3,475,092,320      cycles                           #    2.029 GHz                    
+     6,350,458,775      instructions                     #    1.83  insn per cycle         
+       1.713327675 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2287) (512y:   24) (512z: 2153)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt
index 8628d648bd..a8425bb782 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_21:01:37
+DATE: 2024-08-08_20:29:26
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 8.877973e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.175805e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.383438e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.044161e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.197356e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.390140e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.079446e+00 +- 3.403306e-03 )  GeV^0
-TOTAL       :     0.567054 sec
+TOTAL       :     0.573091 sec
 INFO: No Floating Point Exceptions have been reported
-     2,330,915,980      cycles                           #    2.965 GHz                    
-     3,416,048,789      instructions                     #    1.47  insn per cycle         
-       0.844941386 seconds time elapsed
+     2,302,500,947      cycles                           #    2.899 GHz                    
+     3,359,714,134      instructions                     #    1.46  insn per cycle         
+       0.851330175 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 149
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.006766e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.063518e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.063518e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.971169e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.027848e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.027848e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.079573e+00 +- 3.404712e-03 )  GeV^0
-TOTAL       :     5.365120 sec
+TOTAL       :     5.460102 sec
 INFO: No Floating Point Exceptions have been reported
-    16,409,243,058      cycles                           #    3.056 GHz                    
-    45,368,159,236      instructions                     #    2.76  insn per cycle         
-       5.370408040 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  605) (avx2:    0) (512y:    0) (512z:    0)
+    16,412,251,635      cycles                           #    3.004 GHz                    
+    45,363,438,738      instructions                     #    2.76  insn per cycle         
+       5.465223733 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  592) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.694480e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.046550e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.046550e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.639399e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.984668e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.984668e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.079572e+00 +- 3.404712e-03 )  GeV^0
-TOTAL       :     2.374059 sec
+TOTAL       :     2.397788 sec
 INFO: No Floating Point Exceptions have been reported
-     7,244,886,908      cycles                           #    3.046 GHz                    
-    17,787,068,117      instructions                     #    2.46  insn per cycle         
-       2.379307738 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3151) (avx2:    0) (512y:    0) (512z:    0)
+     7,225,778,706      cycles                           #    3.008 GHz                    
+    17,780,590,298      instructions                     #    2.46  insn per cycle         
+       2.402807836 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3133) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.623336e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.814218e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.814218e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.542458e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.724935e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.724935e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.079550e+00 +- 3.404207e-03 )  GeV^0
-TOTAL       :     1.356912 sec
+TOTAL       :     1.365171 sec
 INFO: No Floating Point Exceptions have been reported
-     3,928,174,846      cycles                           #    2.885 GHz                    
-     8,249,136,865      instructions                     #    2.10  insn per cycle         
-       1.362262336 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3374) (512y:    0) (512z:    0)
+     3,905,630,598      cycles                           #    2.852 GHz                    
+     8,242,044,959      instructions                     #    2.11  insn per cycle         
+       1.370327142 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3350) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.101855e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.043662e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.043662e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.995768e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.031926e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.031926e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 2.079550e+00 +- 3.404207e-03 )  GeV^0
-TOTAL       :     1.293228 sec
+TOTAL       :     1.306099 sec
 INFO: No Floating Point Exceptions have been reported
-     3,737,571,332      cycles                           #    2.880 GHz                    
-     7,870,738,692      instructions                     #    2.11  insn per cycle         
-       1.298462511 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3226) (512y:   20) (512z:    0)
+     3,721,703,946      cycles                           #    2.840 GHz                    
+     7,863,594,201      instructions                     #    2.11  insn per cycle         
+       1.311330370 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3196) (512y:   20) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 6.862282e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.577950e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.577950e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.758543e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.446976e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.446976e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.079550e+00 +- 3.404208e-03 )  GeV^0
-TOTAL       :     1.670875 sec
+TOTAL       :     1.692116 sec
 INFO: No Floating Point Exceptions have been reported
-     3,429,920,435      cycles                           #    2.047 GHz                    
-     6,049,639,626      instructions                     #    1.76  insn per cycle         
-       1.676119731 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2400) (512y:   24) (512z: 2152)
+     3,425,904,021      cycles                           #    2.019 GHz                    
+     6,042,797,691      instructions                     #    1.76  insn per cycle         
+       1.697363173 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2287) (512y:   24) (512z: 2153)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt
index bff07c6868..a9cab1763c 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_20:58:52
+DATE: 2024-08-08_20:26:38
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 8.792680e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.180620e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.394308e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.225239e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.197913e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.389129e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086719e+00 +- 3.413389e-03 )  GeV^0
-TOTAL       :     0.510503 sec
+TOTAL       :     0.517448 sec
 INFO: No Floating Point Exceptions have been reported
-     2,177,491,273      cycles                           #    2.975 GHz                    
-     3,411,887,204      instructions                     #    1.57  insn per cycle         
-       0.788564891 seconds time elapsed
+     2,112,624,842      cycles                           #    2.859 GHz                    
+     3,317,853,292      instructions                     #    1.57  insn per cycle         
+       0.795716447 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 149
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.987602e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.045188e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.045188e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.922136e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.976186e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.976186e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086780e+00 +- 3.413794e-03 )  GeV^0
-TOTAL       :     5.361967 sec
+TOTAL       :     5.540195 sec
 INFO: No Floating Point Exceptions have been reported
-    16,253,507,490      cycles                           #    3.029 GHz                    
-    45,338,452,684      instructions                     #    2.79  insn per cycle         
-       5.367358373 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  605) (avx2:    0) (512y:    0) (512z:    0)
+    16,275,080,243      cycles                           #    2.936 GHz                    
+    45,337,789,928      instructions                     #    2.79  insn per cycle         
+       5.545390256 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  592) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.531253e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.854760e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.854760e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.488675e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.824628e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.824628e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086779e+00 +- 3.413793e-03 )  GeV^0
-TOTAL       :     2.399620 sec
+TOTAL       :     2.422437 sec
 INFO: No Floating Point Exceptions have been reported
-     7,075,172,269      cycles                           #    2.944 GHz                    
-    17,774,820,782      instructions                     #    2.51  insn per cycle         
-       2.404849962 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3151) (avx2:    0) (512y:    0) (512z:    0)
+     7,052,758,354      cycles                           #    2.906 GHz                    
+    17,767,509,302      instructions                     #    2.52  insn per cycle         
+       2.427864435 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3133) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.656451e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.855005e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.855005e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.294778e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.430722e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.430722e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.296967 sec
+TOTAL       :     1.350098 sec
 INFO: No Floating Point Exceptions have been reported
-     3,759,549,216      cycles                           #    2.888 GHz                    
-     8,264,701,307      instructions                     #    2.20  insn per cycle         
-       1.302290635 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3374) (512y:    0) (512z:    0)
+     3,737,878,511      cycles                           #    2.759 GHz                    
+     8,257,495,819      instructions                     #    2.21  insn per cycle         
+       1.355605620 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3350) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.175558e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.053477e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.053477e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.700373e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.969590e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.969590e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.226419 sec
+TOTAL       :     1.290488 sec
 INFO: No Floating Point Exceptions have been reported
-     3,556,498,736      cycles                           #    2.889 GHz                    
-     7,920,503,592      instructions                     #    2.23  insn per cycle         
-       1.231741361 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3226) (512y:   20) (512z:    0)
+     3,556,397,958      cycles                           #    2.746 GHz                    
+     7,911,980,107      instructions                     #    2.22  insn per cycle         
+       1.296127398 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3196) (512y:   20) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 6.953556e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.671434e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.671434e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.356565e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.990428e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.990428e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.592884 sec
+TOTAL       :     1.736165 sec
 INFO: No Floating Point Exceptions have been reported
-     3,261,158,464      cycles                           #    2.042 GHz                    
-     6,099,877,801      instructions                     #    1.87  insn per cycle         
-       1.598103261 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2400) (512y:   24) (512z: 2152)
+     3,256,937,975      cycles                           #    1.871 GHz                    
+     6,093,354,447      instructions                     #    1.87  insn per cycle         
+       1.741565922 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2287) (512y:   24) (512z: 2153)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
index 3fc0fbbc6c..1b7d56c0f4 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_20:56:13
+DATE: 2024-08-08_20:23:55
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,22 +50,24 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 8.903263e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.184841e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.384340e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.925974e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.195417e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.383637e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086805e+00 +- 3.414078e-03 )  GeV^0
-TOTAL       :     0.613170 sec
+TOTAL       :     0.617651 sec
 INFO: No Floating Point Exceptions have been reported
-     2,489,925,895      cycles                           #    2.985 GHz                    
-     3,862,378,401      instructions                     #    1.55  insn per cycle         
-       0.890452214 seconds time elapsed
+     2,472,700,101      cycles                           #    2.956 GHz                    
+     3,844,270,088      instructions                     #    1.55  insn per cycle         
+       0.895131936 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 149
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -82,20 +84,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.005717e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.061976e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.061976e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.959227e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.014297e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.014297e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086780e+00 +- 3.413794e-03 )  GeV^0
-TOTAL       :     5.312002 sec
+TOTAL       :     5.435139 sec
 INFO: No Floating Point Exceptions have been reported
-    16,240,467,534      cycles                           #    3.055 GHz                    
-    45,338,534,619      instructions                     #    2.79  insn per cycle         
-       5.317436580 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  605) (avx2:    0) (512y:    0) (512z:    0)
+    16,264,887,736      cycles                           #    2.990 GHz                    
+    45,334,381,661      instructions                     #    2.79  insn per cycle         
+       5.440210307 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  592) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -110,20 +113,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.653354e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.006022e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.006022e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.519066e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.848466e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.848466e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086779e+00 +- 3.413793e-03 )  GeV^0
-TOTAL       :     2.338600 sec
+TOTAL       :     2.405682 sec
 INFO: No Floating Point Exceptions have been reported
-     7,086,830,161      cycles                           #    3.025 GHz                    
-    17,776,494,030      instructions                     #    2.51  insn per cycle         
-       2.344006781 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3151) (avx2:    0) (512y:    0) (512z:    0)
+     7,056,903,182      cycles                           #    2.928 GHz                    
+    17,767,514,446      instructions                     #    2.52  insn per cycle         
+       2.410973137 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3133) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -138,20 +142,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.547528e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.715398e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.715398e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.565756e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.749553e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.749553e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.312500 sec
+TOTAL       :     1.305436 sec
 INFO: No Floating Point Exceptions have been reported
-     3,749,385,603      cycles                           #    2.846 GHz                    
-     8,264,672,474      instructions                     #    2.20  insn per cycle         
-       1.317884402 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3374) (512y:    0) (512z:    0)
+     3,753,143,327      cycles                           #    2.865 GHz                    
+     8,257,983,801      instructions                     #    2.20  insn per cycle         
+       1.310628316 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3350) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -166,20 +171,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.067824e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.042356e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.042356e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.040312e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.036836e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.036836e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.242601 sec
+TOTAL       :     1.242569 sec
 INFO: No Floating Point Exceptions have been reported
-     3,563,693,975      cycles                           #    2.857 GHz                    
-     7,919,771,531      instructions                     #    2.22  insn per cycle         
-       1.248059174 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3226) (512y:   20) (512z:    0)
+     3,552,004,540      cycles                           #    2.848 GHz                    
+     7,912,724,917      instructions                     #    2.23  insn per cycle         
+       1.247741947 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3196) (512y:   20) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -194,20 +200,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 6.790143e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.492603e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.492603e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.813901e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.506813e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.506813e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.632153 sec
+TOTAL       :     1.621227 sec
 INFO: No Floating Point Exceptions have been reported
-     3,268,889,948      cycles                           #    1.997 GHz                    
-     6,099,013,155      instructions                     #    1.87  insn per cycle         
-       1.637536774 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2400) (512y:   24) (512z: 2152)
+     3,253,421,004      cycles                           #    2.002 GHz                    
+     6,092,602,588      instructions                     #    1.87  insn per cycle         
+       1.626390565 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2287) (512y:   24) (512z: 2153)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
index 0c09829995..613986d3ca 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_20:18:42
+DATE: 2024-08-08_19:51:52
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 9.541434e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.458470e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.707090e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.011234e+08                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.481106e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.718662e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086719e+00 +- 3.413389e-03 )  GeV^0
-TOTAL       :     0.479969 sec
+TOTAL       :     0.482144 sec
 INFO: No Floating Point Exceptions have been reported
-     2,105,563,726      cycles                           #    2.968 GHz                    
-     3,011,385,971      instructions                     #    1.43  insn per cycle         
-       0.766621397 seconds time elapsed
+     2,069,508,701      cycles                           #    2.943 GHz                    
+     2,973,558,730      instructions                     #    1.44  insn per cycle         
+       0.762169669 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.034864e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.093151e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.093151e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.000971e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.057776e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.057776e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086780e+00 +- 3.413794e-03 )  GeV^0
-TOTAL       :     5.237994 sec
+TOTAL       :     5.337569 sec
 INFO: No Floating Point Exceptions have been reported
-    15,955,011,989      cycles                           #    3.044 GHz                    
-    44,449,172,061      instructions                     #    2.79  insn per cycle         
-       5.243459295 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  550) (avx2:    0) (512y:    0) (512z:    0)
+    16,045,528,009      cycles                           #    3.003 GHz                    
+    44,492,603,616      instructions                     #    2.77  insn per cycle         
+       5.344572857 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  537) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.519058e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.010085e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.010085e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.399267e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.870292e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.870292e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086779e+00 +- 3.413793e-03 )  GeV^0
-TOTAL       :     1.985271 sec
+TOTAL       :     2.040967 sec
 INFO: No Floating Point Exceptions have been reported
-     6,065,760,073      cycles                           #    3.048 GHz                    
-    17,080,403,379      instructions                     #    2.82  insn per cycle         
-       1.990555127 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2882) (avx2:    0) (512y:    0) (512z:    0)
+     6,120,195,211      cycles                           #    2.990 GHz                    
+    17,124,524,771      instructions                     #    2.80  insn per cycle         
+       2.047704691 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2864) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 6.223952e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.826129e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.826129e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.231646e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.843621e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.843621e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.771606 sec
+TOTAL       :     1.779814 sec
 INFO: No Floating Point Exceptions have been reported
-     5,023,238,217      cycles                           #    2.828 GHz                    
-    10,229,394,821      instructions                     #    2.04  insn per cycle         
-       1.777232545 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3917) (512y:    0) (512z:    0)
+     5,080,547,059      cycles                           #    2.845 GHz                    
+    10,273,415,072      instructions                     #    2.02  insn per cycle         
+       1.786648263 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3893) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 6.342994e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.956517e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.956517e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.292968e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.928983e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.928983e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.736711 sec
+TOTAL       :     1.763357 sec
 INFO: No Floating Point Exceptions have been reported
-     4,975,198,217      cycles                           #    2.858 GHz                    
-     9,999,864,692      instructions                     #    2.01  insn per cycle         
-       1.741779350 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3824) (512y:    2) (512z:    0)
+     5,036,199,960      cycles                           #    2.847 GHz                    
+    10,043,698,662      instructions                     #    1.99  insn per cycle         
+       1.770080531 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3794) (512y:    2) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.869806e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.213933e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.213933e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.908901e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.261898e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.261898e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     2.237189 sec
+TOTAL       :     2.233509 sec
 INFO: No Floating Point Exceptions have been reported
-     4,364,643,529      cycles                           #    1.947 GHz                    
-     8,448,269,915      instructions                     #    1.94  insn per cycle         
-       2.242515799 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2897) (512y:    4) (512z: 2751)
+     4,417,373,079      cycles                           #    1.973 GHz                    
+     8,493,082,992      instructions                     #    1.92  insn per cycle         
+       2.240143434 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2784) (512y:    4) (512z: 2752)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
index 7c0eb4ece3..0ca4814912 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_20:42:27
+DATE: 2024-08-08_20:09:41
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 8.723565e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.180139e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.395798e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.662526e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.213312e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.395769e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086719e+00 +- 3.413389e-03 )  GeV^0
-TOTAL       :     0.483814 sec
+TOTAL       :     0.479336 sec
 INFO: No Floating Point Exceptions have been reported
-     2,086,603,150      cycles                           #    2.947 GHz                    
-     2,996,291,742      instructions                     #    1.44  insn per cycle         
-       0.766644042 seconds time elapsed
+     2,068,711,068      cycles                           #    2.929 GHz                    
+     2,952,499,501      instructions                     #    1.43  insn per cycle         
+       0.763196119 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 149
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.588532e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.684127e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.684127e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.557673e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.652343e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.652343e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086780e+00 +- 3.413794e-03 )  GeV^0
-TOTAL       :     4.135203 sec
+TOTAL       :     4.192940 sec
 INFO: No Floating Point Exceptions have been reported
-    12,570,266,885      cycles                           #    3.037 GHz                    
-    34,622,435,481      instructions                     #    2.75  insn per cycle         
-       4.140528435 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  696) (avx2:    0) (512y:    0) (512z:    0)
+    12,602,357,038      cycles                           #    3.002 GHz                    
+    34,631,326,432      instructions                     #    2.75  insn per cycle         
+       4.199620510 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  683) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.480340e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.970700e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.970700e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.457087e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.945109e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.945109e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086779e+00 +- 3.413793e-03 )  GeV^0
-TOTAL       :     1.999581 sec
+TOTAL       :     2.017495 sec
 INFO: No Floating Point Exceptions have been reported
-     6,075,791,171      cycles                           #    3.032 GHz                    
-    14,848,680,325      instructions                     #    2.44  insn per cycle         
-       2.005001414 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2993) (avx2:    0) (512y:    0) (512z:    0)
+     6,096,552,375      cycles                           #    3.013 GHz                    
+    14,886,527,681      instructions                     #    2.44  insn per cycle         
+       2.024226195 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2980) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.412557e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.265368e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.265368e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.320703e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.178361e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.178361e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.500086 sec
+TOTAL       :     1.525431 sec
 INFO: No Floating Point Exceptions have been reported
-     4,313,416,167      cycles                           #    2.867 GHz                    
-     9,055,140,923      instructions                     #    2.10  insn per cycle         
-       1.505373819 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4470) (512y:    0) (512z:    0)
+     4,362,864,395      cycles                           #    2.849 GHz                    
+     9,093,170,699      instructions                     #    2.08  insn per cycle         
+       1.532091223 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4446) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.580624e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.471236e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.471236e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.442008e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.347351e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.347351e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.467600 sec
+TOTAL       :     1.505548 sec
 INFO: No Floating Point Exceptions have been reported
-     4,192,369,884      cycles                           #    2.848 GHz                    
-     8,664,912,975      instructions                     #    2.07  insn per cycle         
-       1.473038348 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4243) (512y:    0) (512z:    0)
+     4,283,778,078      cycles                           #    2.834 GHz                    
+     8,707,570,636      instructions                     #    2.03  insn per cycle         
+       1.512346731 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4213) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.697028e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.182026e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.182026e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.480199e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.987074e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.987074e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.926016 sec
+TOTAL       :     2.010348 sec
 INFO: No Floating Point Exceptions have been reported
-     3,834,269,127      cycles                           #    1.986 GHz                    
-     7,807,487,724      instructions                     #    2.04  insn per cycle         
-       1.931378267 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4424) (512y:    0) (512z: 2555)
+     3,921,508,341      cycles                           #    1.945 GHz                    
+     7,849,973,775      instructions                     #    2.00  insn per cycle         
+       2.017051814 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4252) (512y:    0) (512z: 2556)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
index e900e27558..c66a4f9500 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_20:42:47
+DATE: 2024-08-08_20:10:00
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 9.140621e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.444301e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.723845e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.014498e+08                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.491996e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.727921e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086719e+00 +- 3.413389e-03 )  GeV^0
-TOTAL       :     0.479825 sec
+TOTAL       :     0.481358 sec
 INFO: No Floating Point Exceptions have been reported
-     2,082,597,398      cycles                           #    2.951 GHz                    
-     2,989,924,480      instructions                     #    1.44  insn per cycle         
-       0.762031602 seconds time elapsed
+     2,037,978,515      cycles                           #    2.886 GHz                    
+     2,961,010,767      instructions                     #    1.45  insn per cycle         
+       0.762837811 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.739102e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.845252e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.845252e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.697323e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.802206e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.802206e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086780e+00 +- 3.413794e-03 )  GeV^0
-TOTAL       :     3.913257 sec
+TOTAL       :     3.980371 sec
 INFO: No Floating Point Exceptions have been reported
-    11,819,760,720      cycles                           #    3.017 GHz                    
-    35,092,176,344      instructions                     #    2.97  insn per cycle         
-       3.918717089 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  466) (avx2:    0) (512y:    0) (512z:    0)
+    11,889,490,017      cycles                           #    2.983 GHz                    
+    35,106,748,392      instructions                     #    2.95  insn per cycle         
+       3.987184887 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  453) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.569207e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.078523e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.078523e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.502653e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.994079e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.994079e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086779e+00 +- 3.413793e-03 )  GeV^0
-TOTAL       :     1.968652 sec
+TOTAL       :     1.999831 sec
 INFO: No Floating Point Exceptions have been reported
-     5,956,774,806      cycles                           #    3.019 GHz                    
-    14,470,057,994      instructions                     #    2.43  insn per cycle         
-       1.974105545 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2572) (avx2:    0) (512y:    0) (512z:    0)
+     5,999,305,364      cycles                           #    2.992 GHz                    
+    14,506,447,484      instructions                     #    2.42  insn per cycle         
+       2.006483206 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2559) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.384560e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.251377e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.251377e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.608204e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.550220e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.550220e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.505860 sec
+TOTAL       :     1.473214 sec
 INFO: No Floating Point Exceptions have been reported
-     4,150,238,324      cycles                           #    2.747 GHz                    
-     8,882,886,534      instructions                     #    2.14  insn per cycle         
-       1.511296107 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3580) (512y:    0) (512z:    0)
+     4,213,841,990      cycles                           #    2.849 GHz                    
+     8,921,034,070      instructions                     #    2.12  insn per cycle         
+       1.479975021 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3556) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.444581e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.341630e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.341630e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.485226e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.400149e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.400149e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.495867 sec
+TOTAL       :     1.496613 sec
 INFO: No Floating Point Exceptions have been reported
-     4,109,698,688      cycles                           #    2.739 GHz                    
-     8,410,471,582      instructions                     #    2.05  insn per cycle         
-       1.501436955 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3314) (512y:    0) (512z:    0)
+     4,261,968,497      cycles                           #    2.836 GHz                    
+     8,450,409,335      instructions                     #    1.98  insn per cycle         
+       1.503441367 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3284) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [ha
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.798030e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.297222e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.297222e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.731827e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.224198e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.224198e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086810e+00 +- 3.414231e-03 )  GeV^0
-TOTAL       :     1.893929 sec
+TOTAL       :     1.924845 sec
 INFO: No Floating Point Exceptions have been reported
-     3,787,882,654      cycles                           #    1.996 GHz                    
-     7,701,397,342      instructions                     #    2.03  insn per cycle         
-       1.899199723 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3440) (512y:    0) (512z: 2107)
+     3,821,108,888      cycles                           #    1.979 GHz                    
+     7,740,611,821      instructions                     #    2.03  insn per cycle         
+       1.931585644 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3268) (512y:    0) (512z: 2108)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
index 1d2c49ac8a..9e258a42c8 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_20:19:03
+DATE: 2024-08-08_19:52:14
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.880314e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.172219e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.274853e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.928215e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.172881e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.273641e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     0.520012 sec
+TOTAL       :     0.521950 sec
 INFO: No Floating Point Exceptions have been reported
-     2,202,911,416      cycles                           #    2.926 GHz                    
-     3,165,598,515      instructions                     #    1.44  insn per cycle         
-       0.810414735 seconds time elapsed
+     2,213,686,839      cycles                           #    2.946 GHz                    
+     3,178,577,075      instructions                     #    1.44  insn per cycle         
+       0.810096796 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.860309e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.907494e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.907494e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.841341e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.888035e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.888035e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     5.743741 sec
+TOTAL       :     5.832684 sec
 INFO: No Floating Point Exceptions have been reported
-    17,418,994,782      cycles                           #    3.030 GHz                    
-    46,094,189,242      instructions                     #    2.65  insn per cycle         
-       5.749395250 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  631) (avx2:    0) (512y:    0) (512z:    0)
+    17,545,887,667      cycles                           #    3.004 GHz                    
+    46,212,560,657      instructions                     #    2.63  insn per cycle         
+       5.842093812 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  618) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.315434e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.480321e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.480321e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.270852e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.438233e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.438233e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     3.270556 sec
+TOTAL       :     3.344937 sec
 INFO: No Floating Point Exceptions have been reported
-     9,955,949,401      cycles                           #    3.040 GHz                    
-    27,592,161,577      instructions                     #    2.77  insn per cycle         
-       3.275977650 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2593) (avx2:    0) (512y:    0) (512z:    0)
+    10,073,495,315      cycles                           #    3.004 GHz                    
+    27,713,045,845      instructions                     #    2.75  insn per cycle         
+       3.354389607 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2581) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.259072e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.676561e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.676561e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.229785e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.644944e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.644944e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.101748 sec
+TOTAL       :     2.142226 sec
 INFO: No Floating Point Exceptions have been reported
-     6,038,149,520      cycles                           #    2.866 GHz                    
-    12,488,773,306      instructions                     #    2.07  insn per cycle         
-       2.107527607 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2780) (512y:    0) (512z:    0)
+     6,138,817,492      cycles                           #    2.854 GHz                    
+    12,602,197,399      instructions                     #    2.05  insn per cycle         
+       2.151581868 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2762) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.800248e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.308228e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.308228e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.722165e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.222047e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.222047e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     1.915933 sec
+TOTAL       :     1.971142 sec
 INFO: No Floating Point Exceptions have been reported
-     5,504,033,479      cycles                           #    2.866 GHz                    
-    11,922,925,036      instructions                     #    2.17  insn per cycle         
-       1.921444713 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2531) (512y:  146) (512z:    0)
+     5,621,798,133      cycles                           #    2.839 GHz                    
+    12,035,423,234      instructions                     #    2.14  insn per cycle         
+       1.980714349 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2507) (512y:  146) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.762973e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.968383e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.968383e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.784432e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.992571e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.992571e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.894520 sec
+TOTAL       :     2.909360 sec
 INFO: No Floating Point Exceptions have been reported
-     5,602,974,880      cycles                           #    1.933 GHz                    
-     8,113,600,641      instructions                     #    1.45  insn per cycle         
-       2.900011376 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1668) (512y:  126) (512z: 1862)
+     5,725,311,509      cycles                           #    1.962 GHz                    
+     8,228,178,315      instructions                     #    1.44  insn per cycle         
+       2.919447921 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1646) (512y:  126) (512z: 1862)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
index 29bff9cc44..0491e4ed6d 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_20:19:27
+DATE: 2024-08-08_19:52:38
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.909940e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.182050e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.286288e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.017343e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.179179e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.286659e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     0.521684 sec
+TOTAL       :     0.519682 sec
 INFO: No Floating Point Exceptions have been reported
-     2,226,569,565      cycles                           #    2.949 GHz                    
-     3,203,343,455      instructions                     #    1.44  insn per cycle         
-       0.811937332 seconds time elapsed
+     2,213,688,235      cycles                           #    2.946 GHz                    
+     3,194,056,853      instructions                     #    1.44  insn per cycle         
+       0.808260316 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.889937e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.939639e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.939639e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.869136e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.918050e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.918050e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     5.656141 sec
+TOTAL       :     5.752898 sec
 INFO: No Floating Point Exceptions have been reported
-    16,953,958,373      cycles                           #    2.995 GHz                    
-    45,121,214,574      instructions                     #    2.66  insn per cycle         
-       5.661842751 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  582) (avx2:    0) (512y:    0) (512z:    0)
+    17,074,104,828      cycles                           #    2.963 GHz                    
+    45,236,287,915      instructions                     #    2.65  insn per cycle         
+       5.764326274 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  569) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.455685e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.638788e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.638788e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.441463e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.626872e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.626872e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     3.140773 sec
+TOTAL       :     3.185909 sec
 INFO: No Floating Point Exceptions have been reported
-     9,518,063,498      cycles                           #    3.026 GHz                    
-    26,244,492,434      instructions                     #    2.76  insn per cycle         
-       3.146259863 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2397) (avx2:    0) (512y:    0) (512z:    0)
+     9,649,087,118      cycles                           #    3.020 GHz                    
+    26,365,137,437      instructions                     #    2.73  insn per cycle         
+       3.195361891 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2385) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.657640e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.979119e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.979119e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.613455e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.935335e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.935335e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.357004 sec
+TOTAL       :     2.413480 sec
 INFO: No Floating Point Exceptions have been reported
-     6,725,665,752      cycles                           #    2.848 GHz                    
-    14,035,144,203      instructions                     #    2.09  insn per cycle         
-       2.362636169 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2902) (512y:    0) (512z:    0)
+     6,867,786,043      cycles                           #    2.835 GHz                    
+    14,147,220,960      instructions                     #    2.06  insn per cycle         
+       2.423178008 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2884) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.936976e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.299377e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.299377e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.856156e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.210888e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.210888e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.229512 sec
+TOTAL       :     2.298392 sec
 INFO: No Floating Point Exceptions have been reported
-     6,396,564,093      cycles                           #    2.863 GHz                    
-    13,527,050,240      instructions                     #    2.11  insn per cycle         
-       2.234879866 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2547) (512y:  302) (512z:    0)
+     6,526,789,768      cycles                           #    2.829 GHz                    
+    13,640,691,375      instructions                     #    2.09  insn per cycle         
+       2.307759550 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2523) (512y:  302) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [ha
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.800963e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.008328e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.008328e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.731216e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.937483e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.937483e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.086689e+00 +- 3.413217e-03 )  GeV^0
-TOTAL       :     2.864981 sec
+TOTAL       :     2.951920 sec
 INFO: No Floating Point Exceptions have been reported
-     5,622,466,186      cycles                           #    1.960 GHz                    
-     9,214,490,312      instructions                     #    1.64  insn per cycle         
-       2.870602335 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1453) (512y:  212) (512z: 2059)
+     5,713,181,383      cycles                           #    1.930 GHz                    
+     9,325,302,677      instructions                     #    1.63  insn per cycle         
+       2.961562881 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1431) (512y:  212) (512z: 2059)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
index 800b42a2f7..f4571b9f6b 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
-DATE: 2024-06-28_20:19:52
+DATE: 2024-08-08_19:53:03
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 8.784849e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.050628e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.064964e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.927019e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.050993e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.064681e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     0.469118 sec
+TOTAL       :     0.466363 sec
 INFO: No Floating Point Exceptions have been reported
-     2,006,582,876      cycles                           #    2.941 GHz                    
-     2,882,205,133      instructions                     #    1.44  insn per cycle         
-       0.742141272 seconds time elapsed
+     2,031,704,885      cycles                           #    2.932 GHz                    
+     2,907,931,480      instructions                     #    1.43  insn per cycle         
+       0.749954927 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 1.108512e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.323620e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.336186e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.108955e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.322519e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.334742e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 6.734461e+02 +- 4.775415e+02 )  GeV^-2
-TOTAL       :     0.603409 sec
+TOTAL       :     0.601379 sec
 INFO: No Floating Point Exceptions have been reported
-     2,505,659,997      cycles                           #    2.953 GHz                    
-     3,807,332,090      instructions                     #    1.52  insn per cycle         
-       0.909344488 seconds time elapsed
+     2,455,141,462      cycles                           #    2.938 GHz                    
+     3,762,396,340      instructions                     #    1.53  insn per cycle         
+       0.893863333 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.487353e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.499535e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.499535e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.481232e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.493616e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.493616e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     6.610434 sec
+TOTAL       :     6.623962 sec
 INFO: No Floating Point Exceptions have been reported
-    19,883,753,596      cycles                           #    3.007 GHz                    
-    59,920,127,931      instructions                     #    3.01  insn per cycle         
-       6.614643893 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1212) (avx2:    0) (512y:    0) (512z:    0)
+    19,900,544,736      cycles                           #    3.003 GHz                    
+    59,917,689,995      instructions                     #    3.01  insn per cycle         
+       6.628146634 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1199) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.544552e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.585762e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.585762e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.692821e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.734716e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.734716e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     3.630161 sec
+TOTAL       :     3.511000 sec
 INFO: No Floating Point Exceptions have been reported
-    10,622,833,743      cycles                           #    2.926 GHz                    
-    31,096,080,168      instructions                     #    2.93  insn per cycle         
-       3.634294540 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 5233) (avx2:    0) (512y:    0) (512z:    0)
+    10,573,188,323      cycles                           #    3.009 GHz                    
+    31,088,228,992      instructions                     #    2.94  insn per cycle         
+       3.514850116 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 5221) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.447536e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.617462e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.617462e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.311594e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.480158e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.480158e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     1.756614 sec
+TOTAL       :     1.779751 sec
 INFO: No Floating Point Exceptions have been reported
-     4,997,708,956      cycles                           #    2.841 GHz                    
-    11,412,697,579      instructions                     #    2.28  insn per cycle         
-       1.763192357 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4653) (512y:    0) (512z:    0)
+     4,993,361,094      cycles                           #    2.801 GHz                    
+    11,406,864,540      instructions                     #    2.28  insn per cycle         
+       1.783592873 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4635) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.046283e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.066541e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.066541e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.047569e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.068559e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.068559e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     1.588128 sec
+TOTAL       :     1.583863 sec
 INFO: No Floating Point Exceptions have been reported
-     4,445,949,332      cycles                           #    2.794 GHz                    
-    10,670,541,390      instructions                     #    2.40  insn per cycle         
-       1.592278823 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4395) (512y:   91) (512z:    0)
+     4,443,684,141      cycles                           #    2.800 GHz                    
+    10,665,267,804      instructions                     #    2.40  insn per cycle         
+       1.587769074 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4371) (512y:   91) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.399503e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.504160e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.504160e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.461711e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.569260e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.569260e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     2.239033 sec
+TOTAL       :     2.218169 sec
 INFO: No Floating Point Exceptions have been reported
-     4,137,934,138      cycles                           #    1.845 GHz                    
-     5,973,596,045      instructions                     #    1.44  insn per cycle         
-       2.243189213 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1629) (512y:   95) (512z: 3576)
+     4,131,467,216      cycles                           #    1.860 GHz                    
+     5,968,009,062      instructions                     #    1.44  insn per cycle         
+       2.222079730 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1605) (512y:   95) (512z: 3576)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt
index 511a1a868d..a42937504e 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
-DATE: 2024-06-28_20:51:08
+DATE: 2024-08-08_20:18:23
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 2.590014e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.672492e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.672492e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.687469e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.986061e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.986061e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     0.500146 sec
+TOTAL       :     0.493096 sec
 INFO: No Floating Point Exceptions have been reported
-     2,070,151,910      cycles                           #    2.909 GHz                    
-     3,149,644,102      instructions                     #    1.52  insn per cycle         
-       0.770025396 seconds time elapsed
+     2,045,059,008      cycles                           #    2.898 GHz                    
+     3,097,048,003      instructions                     #    1.51  insn per cycle         
+       0.762660564 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -79,18 +79,20 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 2.719854e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.828063e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.828063e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.805866e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.910227e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.910227e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 6.734461e+02 +- 4.775415e+02 )  GeV^-2
-TOTAL       :     0.820483 sec
+TOTAL       :     0.818307 sec
 INFO: No Floating Point Exceptions have been reported
-     3,187,057,338      cycles                           #    2.976 GHz                    
-     5,022,211,168      instructions                     #    1.58  insn per cycle         
-       1.131275813 seconds time elapsed
+     3,140,684,454      cycles                           #    2.950 GHz                    
+     5,061,508,169      instructions                     #    1.61  insn per cycle         
+       1.128278285 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -108,20 +110,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.524282e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.536797e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.536797e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.492873e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.505187e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.505187e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     6.520579 sec
+TOTAL       :     6.599351 sec
 INFO: No Floating Point Exceptions have been reported
-    19,909,517,926      cycles                           #    3.052 GHz                    
-    59,928,033,518      instructions                     #    3.01  insn per cycle         
-       6.524962185 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1212) (avx2:    0) (512y:    0) (512z:    0)
+    19,933,005,895      cycles                           #    3.019 GHz                    
+    59,920,307,427      instructions                     #    3.01  insn per cycle         
+       6.603770814 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1199) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -137,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.686960e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.730010e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.730010e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.695185e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.737821e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.737821e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     3.526110 sec
+TOTAL       :     3.515055 sec
 INFO: No Floating Point Exceptions have been reported
-    10,658,484,150      cycles                           #    3.020 GHz                    
-    31,144,317,033      instructions                     #    2.92  insn per cycle         
-       3.530650848 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 5233) (avx2:    0) (512y:    0) (512z:    0)
+    10,602,064,942      cycles                           #    3.013 GHz                    
+    31,134,275,582      instructions                     #    2.94  insn per cycle         
+       3.519385575 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 5221) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -166,20 +170,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.376823e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.547869e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.547869e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.301392e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.470755e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.470755e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     1.778728 sec
+TOTAL       :     1.788543 sec
 INFO: No Floating Point Exceptions have been reported
-     5,043,044,785      cycles                           #    2.829 GHz                    
-    11,463,079,799      instructions                     #    2.27  insn per cycle         
-       1.783508677 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4653) (512y:    0) (512z:    0)
+     5,028,204,629      cycles                           #    2.805 GHz                    
+    11,455,559,201      instructions                     #    2.28  insn per cycle         
+       1.792981978 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4635) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -195,20 +200,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.067257e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.088583e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.088583e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.050919e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.072418e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.072418e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     1.565555 sec
+TOTAL       :     1.585500 sec
 INFO: No Floating Point Exceptions have been reported
-     4,485,484,739      cycles                           #    2.858 GHz                    
-    10,721,399,115      instructions                     #    2.39  insn per cycle         
-       1.570106895 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4395) (512y:   91) (512z:    0)
+     4,477,945,053      cycles                           #    2.818 GHz                    
+    10,713,475,732      instructions                     #    2.39  insn per cycle         
+       1.589826674 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4371) (512y:   91) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -224,20 +230,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.460504e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.573111e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.573111e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.347709e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.453074e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.453074e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     2.229247 sec
+TOTAL       :     2.257984 sec
 INFO: No Floating Point Exceptions have been reported
-     4,189,282,452      cycles                           #    1.876 GHz                    
-     6,014,301,160      instructions                     #    1.44  insn per cycle         
-       2.233707766 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1629) (512y:   95) (512z: 3576)
+     4,161,878,306      cycles                           #    1.840 GHz                    
+     6,004,301,884      instructions                     #    1.44  insn per cycle         
+       2.262398569 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1605) (512y:   95) (512z: 3576)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt
index 41d4b9b4cc..6efe0f69f4 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
-DATE: 2024-06-28_20:20:17
+DATE: 2024-08-08_19:53:28
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 8.764297e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.047002e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.061437e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.841089e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.040503e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.053751e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     0.473798 sec
+TOTAL       :     0.462910 sec
 INFO: No Floating Point Exceptions have been reported
-     1,951,625,584      cycles                           #    2.812 GHz                    
-     2,796,879,640      instructions                     #    1.43  insn per cycle         
-       0.755787707 seconds time elapsed
+     2,010,149,699      cycles                           #    2.952 GHz                    
+     2,896,854,048      instructions                     #    1.44  insn per cycle         
+       0.738052118 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 1.103167e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.314097e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.326586e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.107639e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.318401e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.329750e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 6.734461e+02 +- 4.775415e+02 )  GeV^-2
-TOTAL       :     0.594021 sec
+TOTAL       :     0.598813 sec
 INFO: No Floating Point Exceptions have been reported
-     2,462,107,240      cycles                           #    2.964 GHz                    
-     3,775,116,185      instructions                     #    1.53  insn per cycle         
-       0.889360809 seconds time elapsed
+     2,457,830,026      cycles                           #    2.951 GHz                    
+     3,751,049,656      instructions                     #    1.53  insn per cycle         
+       0.893099521 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.505210e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.517375e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.517375e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.489979e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.502462e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.502462e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     6.563107 sec
+TOTAL       :     6.600482 sec
 INFO: No Floating Point Exceptions have been reported
-    19,911,970,043      cycles                           #    3.032 GHz                    
-    60,133,794,458      instructions                     #    3.02  insn per cycle         
-       6.567254097 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1335) (avx2:    0) (512y:    0) (512z:    0)
+    19,968,279,527      cycles                           #    3.024 GHz                    
+    60,133,262,996      instructions                     #    3.01  insn per cycle         
+       6.604278291 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1322) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.806305e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.849646e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.849646e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.723867e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.766716e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.766716e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     3.430797 sec
+TOTAL       :     3.487862 sec
 INFO: No Floating Point Exceptions have been reported
-    10,444,425,353      cycles                           #    3.041 GHz                    
-    30,694,173,313      instructions                     #    2.94  insn per cycle         
-       3.435027589 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 5059) (avx2:    0) (512y:    0) (512z:    0)
+    10,481,040,414      cycles                           #    3.003 GHz                    
+    30,690,087,380      instructions                     #    2.93  insn per cycle         
+       3.491637208 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 5047) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.172198e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.332764e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.332764e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.840811e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.994004e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.994004e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     1.808464 sec
+TOTAL       :     1.873663 sec
 INFO: No Floating Point Exceptions have been reported
-     5,137,237,567      cycles                           #    2.835 GHz                    
-    11,845,239,071      instructions                     #    2.31  insn per cycle         
-       1.812718837 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4759) (512y:    0) (512z:    0)
+     5,129,466,442      cycles                           #    2.733 GHz                    
+    11,839,868,923      instructions                     #    2.31  insn per cycle         
+       1.877504725 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4741) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.884560e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.007012e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.007012e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.982969e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.017062e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.017062e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     1.680647 sec
+TOTAL       :     1.660972 sec
 INFO: No Floating Point Exceptions have been reported
-     4,727,101,217      cycles                           #    2.807 GHz                    
-    11,170,633,130      instructions                     #    2.36  insn per cycle         
-       1.687326475 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4420) (512y:  245) (512z:    0)
+     4,713,444,499      cycles                           #    2.833 GHz                    
+    11,164,953,266      instructions                     #    2.37  insn per cycle         
+       1.664821518 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4396) (512y:  245) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.212331e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.313838e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.313838e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.457192e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.563104e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.563104e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     2.296456 sec
+TOTAL       :     2.218804 sec
 INFO: No Floating Point Exceptions have been reported
-     4,164,500,139      cycles                           #    1.811 GHz                    
-     6,225,483,611      instructions                     #    1.49  insn per cycle         
-       2.302194133 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1525) (512y:  140) (512z: 3678)
+     4,152,440,872      cycles                           #    1.869 GHz                    
+     6,219,243,593      instructions                     #    1.50  insn per cycle         
+       2.222530673 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1501) (512y:  140) (512z: 3678)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
index 31a9a4ac3b..f6f4702d8b 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
-DATE: 2024-06-28_20:20:42
+DATE: 2024-08-08_19:53:53
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 2.235609e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.904693e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.979120e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.320062e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.967518e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.041410e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008472e+02 +- 5.002447e+01 )  GeV^-2
-TOTAL       :     0.449760 sec
+TOTAL       :     0.444288 sec
 INFO: No Floating Point Exceptions have been reported
-     1,987,597,220      cycles                           #    2.945 GHz                    
-     2,797,236,541      instructions                     #    1.41  insn per cycle         
-       0.739644538 seconds time elapsed
+     1,959,595,734      cycles                           #    2.963 GHz                    
+     2,777,994,587      instructions                     #    1.42  insn per cycle         
+       0.717899732 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 227
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 2.995131e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.899194e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.956559e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.069470e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.919373e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.975617e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 6.630097e+02 +- 4.770717e+02 )  GeV^-2
-TOTAL       :     0.496488 sec
+TOTAL       :     0.495533 sec
 INFO: No Floating Point Exceptions have been reported
-     2,159,378,990      cycles                           #    2.950 GHz                    
-     3,070,658,648      instructions                     #    1.42  insn per cycle         
-       0.788842582 seconds time elapsed
+     2,156,454,732      cycles                           #    2.941 GHz                    
+     3,086,518,049      instructions                     #    1.43  insn per cycle         
+       0.790560540 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.597401e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.610500e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.610500e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.572191e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.585337e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.585337e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.009236e+02 +- 5.002643e+01 )  GeV^-2
-TOTAL       :     6.330423 sec
+TOTAL       :     6.388092 sec
 INFO: No Floating Point Exceptions have been reported
-    19,241,703,454      cycles                           #    3.039 GHz                    
-    59,623,316,700      instructions                     #    3.10  insn per cycle         
-       6.335537576 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  972) (avx2:    0) (512y:    0) (512z:    0)
+    19,202,614,309      cycles                           #    3.005 GHz                    
+    59,612,894,743      instructions                     #    3.10  insn per cycle         
+       6.392159520 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  959) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.293498e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.432828e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.432828e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.292655e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.433094e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.433094e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.009236e+02 +- 5.002643e+01 )  GeV^-2
-TOTAL       :     1.996199 sec
+TOTAL       :     1.992839 sec
 INFO: No Floating Point Exceptions have been reported
-     6,021,787,246      cycles                           #    3.012 GHz                    
-    17,069,232,964      instructions                     #    2.83  insn per cycle         
-       2.002865216 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 5867) (avx2:    0) (512y:    0) (512z:    0)
+     6,013,924,550      cycles                           #    3.013 GHz                    
+    17,061,326,868      instructions                     #    2.84  insn per cycle         
+       1.996457314 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 5855) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.803557e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.866475e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.866475e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.800495e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.863232e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.863232e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008857e+02 +- 5.002468e+01 )  GeV^-2
-TOTAL       :     0.928687 sec
+TOTAL       :     0.927310 sec
 INFO: No Floating Point Exceptions have been reported
-     2,639,576,993      cycles                           #    2.831 GHz                    
-     6,193,698,724      instructions                     #    2.35  insn per cycle         
-       0.934212919 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 5109) (512y:    0) (512z:    0)
+     2,629,891,219      cycles                           #    2.827 GHz                    
+     6,187,073,232      instructions                     #    2.35  insn per cycle         
+       0.930846209 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 5091) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.001387e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.081356e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.081356e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.976191e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.051455e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.051455e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008857e+02 +- 5.002468e+01 )  GeV^-2
-TOTAL       :     0.839723 sec
+TOTAL       :     0.846370 sec
 INFO: No Floating Point Exceptions have been reported
-     2,406,487,036      cycles                           #    2.857 GHz                    
-     5,797,568,703      instructions                     #    2.41  insn per cycle         
-       0.844635688 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4920) (512y:   36) (512z:    0)
+     2,395,634,403      cycles                           #    2.821 GHz                    
+     5,790,356,055      instructions                     #    2.42  insn per cycle         
+       0.849905167 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4896) (512y:   36) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.514820e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.561506e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.561506e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.518605e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.563959e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.563959e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008856e+02 +- 5.002468e+01 )  GeV^-2
-TOTAL       :     1.105903 sec
+TOTAL       :     1.098394 sec
 INFO: No Floating Point Exceptions have been reported
-     2,082,755,723      cycles                           #    1.879 GHz                    
-     3,398,336,985      instructions                     #    1.63  insn per cycle         
-       1.112213871 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2238) (512y:   39) (512z: 3787)
+     2,076,123,552      cycles                           #    1.885 GHz                    
+     3,391,311,970      instructions                     #    1.63  insn per cycle         
+       1.102116086 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2214) (512y:   39) (512z: 3787)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt
index ec2543c7d3..38bf1cd9c0 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
-DATE: 2024-06-28_20:51:33
+DATE: 2024-08-08_20:18:48
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.709545e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.924352e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.924352e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.003824e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.049696e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.049696e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 1.009071e+02 +- 5.002295e+01 )  GeV^-2
-TOTAL       :     0.461246 sec
+TOTAL       :     0.462593 sec
 INFO: No Floating Point Exceptions have been reported
-     1,992,722,705      cycles                           #    2.958 GHz                    
-     2,887,820,357      instructions                     #    1.45  insn per cycle         
-       0.730517540 seconds time elapsed
+     1,974,680,886      cycles                           #    2.933 GHz                    
+     2,925,643,074      instructions                     #    1.48  insn per cycle         
+       0.731432096 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -79,18 +79,20 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.683273e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.454394e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.454394e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.700147e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.536036e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.536036e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 6.737499e+02 +- 4.776369e+02 )  GeV^-2
-TOTAL       :     0.643585 sec
+TOTAL       :     0.641753 sec
 INFO: No Floating Point Exceptions have been reported
-     2,581,491,261      cycles                           #    2.948 GHz                    
-     3,956,724,353      instructions                     #    1.53  insn per cycle         
-       0.932697919 seconds time elapsed
+     2,565,792,794      cycles                           #    2.944 GHz                    
+     3,938,395,338      instructions                     #    1.53  insn per cycle         
+       0.930086671 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -108,20 +110,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.609102e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.622760e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.622760e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.551720e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.564557e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.564557e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.009236e+02 +- 5.002643e+01 )  GeV^-2
-TOTAL       :     6.305130 sec
+TOTAL       :     6.442209 sec
 INFO: No Floating Point Exceptions have been reported
-    19,239,711,426      cycles                           #    3.050 GHz                    
-    59,623,711,066      instructions                     #    3.10  insn per cycle         
-       6.309396622 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  972) (avx2:    0) (512y:    0) (512z:    0)
+    19,332,196,535      cycles                           #    2.999 GHz                    
+    59,617,412,156      instructions                     #    3.08  insn per cycle         
+       6.446330406 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  959) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -137,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.217420e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.356633e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.356633e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.229338e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.368673e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.368673e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.009236e+02 +- 5.002643e+01 )  GeV^-2
-TOTAL       :     2.018951 sec
+TOTAL       :     2.012620 sec
 INFO: No Floating Point Exceptions have been reported
-     6,043,369,730      cycles                           #    2.989 GHz                    
-    17,117,928,718      instructions                     #    2.83  insn per cycle         
-       2.023188312 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 5867) (avx2:    0) (512y:    0) (512z:    0)
+     6,036,126,177      cycles                           #    2.994 GHz                    
+    17,109,389,715      instructions                     #    2.83  insn per cycle         
+       2.016763535 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 5855) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -166,20 +170,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.791038e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.854876e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.854876e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.740859e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.806079e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.806079e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008857e+02 +- 5.002468e+01 )  GeV^-2
-TOTAL       :     0.940101 sec
+TOTAL       :     0.964100 sec
 INFO: No Floating Point Exceptions have been reported
-     2,664,025,945      cycles                           #    2.826 GHz                    
-     6,230,939,077      instructions                     #    2.34  insn per cycle         
-       0.944279490 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 5109) (512y:    0) (512z:    0)
+     2,661,000,573      cycles                           #    2.750 GHz                    
+     6,223,355,528      instructions                     #    2.34  insn per cycle         
+       0.968303872 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 5091) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -195,20 +200,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.978050e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.055457e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.055457e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.800266e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.868707e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.868707e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008857e+02 +- 5.002468e+01 )  GeV^-2
-TOTAL       :     0.852829 sec
+TOTAL       :     0.933168 sec
 INFO: No Floating Point Exceptions have been reported
-     2,424,382,397      cycles                           #    2.831 GHz                    
-     5,834,398,104      instructions                     #    2.41  insn per cycle         
-       0.857053275 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4920) (512y:   36) (512z:    0)
+     2,423,820,124      cycles                           #    2.587 GHz                    
+     5,827,757,074      instructions                     #    2.40  insn per cycle         
+       0.937581508 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4896) (512y:   36) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -224,20 +230,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.506314e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.552051e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.552051e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.427750e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.470264e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.470264e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008856e+02 +- 5.002468e+01 )  GeV^-2
-TOTAL       :     1.115436 sec
+TOTAL       :     1.172250 sec
 INFO: No Floating Point Exceptions have been reported
-     2,104,767,379      cycles                           #    1.881 GHz                    
-     3,439,635,841      instructions                     #    1.63  insn per cycle         
-       1.119713640 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2238) (512y:   39) (512z: 3787)
+     2,098,127,039      cycles                           #    1.785 GHz                    
+     3,432,639,908      instructions                     #    1.64  insn per cycle         
+       1.176441537 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2214) (512y:   39) (512z: 3787)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt
index e3957db8ae..0ba4eb9609 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
-DATE: 2024-06-28_20:21:02
+DATE: 2024-08-08_19:54:14
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 2.238812e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.928064e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.007026e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.278251e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.942254e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.021816e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008472e+02 +- 5.002447e+01 )  GeV^-2
-TOTAL       :     0.448531 sec
+TOTAL       :     0.446242 sec
 INFO: No Floating Point Exceptions have been reported
-     1,951,411,968      cycles                           #    2.941 GHz                    
-     2,766,558,002      instructions                     #    1.42  insn per cycle         
-       0.764509940 seconds time elapsed
+     1,972,500,118      cycles                           #    2.943 GHz                    
+     2,795,935,059      instructions                     #    1.42  insn per cycle         
+       0.726942838 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 221
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.030678e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.944358e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.002516e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.087674e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.947916e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.002420e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 6.630097e+02 +- 4.770717e+02 )  GeV^-2
-TOTAL       :     0.503686 sec
+TOTAL       :     0.494089 sec
 INFO: No Floating Point Exceptions have been reported
-     2,094,025,202      cycles                           #    2.866 GHz                    
-     2,990,436,652      instructions                     #    1.43  insn per cycle         
-       0.789461074 seconds time elapsed
+     2,134,934,271      cycles                           #    2.953 GHz                    
+     3,048,352,562      instructions                     #    1.43  insn per cycle         
+       0.779729616 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.571820e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.584887e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.584887e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.547958e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.560826e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.560826e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.009236e+02 +- 5.002643e+01 )  GeV^-2
-TOTAL       :     6.392812 sec
+TOTAL       :     6.448288 sec
 INFO: No Floating Point Exceptions have been reported
-    19,438,042,046      cycles                           #    3.039 GHz                    
-    59,356,933,962      instructions                     #    3.05  insn per cycle         
-       6.398066818 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1040) (avx2:    0) (512y:    0) (512z:    0)
+    19,391,308,595      cycles                           #    3.006 GHz                    
+    59,353,270,013      instructions                     #    3.06  insn per cycle         
+       6.452193679 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1027) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.739600e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.892454e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.892454e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.669188e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.820622e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.820622e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.009236e+02 +- 5.002643e+01 )  GeV^-2
-TOTAL       :     1.895118 sec
+TOTAL       :     1.907127 sec
 INFO: No Floating Point Exceptions have been reported
-     5,763,819,561      cycles                           #    3.036 GHz                    
-    16,856,373,051      instructions                     #    2.92  insn per cycle         
-       1.901175393 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 5622) (avx2:    0) (512y:    0) (512z:    0)
+     5,746,722,793      cycles                           #    3.009 GHz                    
+    16,850,100,573      instructions                     #    2.93  insn per cycle         
+       1.910695363 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 5610) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.580567e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.629147e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.629147e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.563334e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.611066e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.611066e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008857e+02 +- 5.002468e+01 )  GeV^-2
-TOTAL       :     1.057144 sec
+TOTAL       :     1.065485 sec
 INFO: No Floating Point Exceptions have been reported
-     3,018,223,480      cycles                           #    2.845 GHz                    
-     6,854,687,139      instructions                     #    2.27  insn per cycle         
-       1.063400892 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 5739) (512y:    0) (512z:    0)
+     3,007,335,634      cycles                           #    2.814 GHz                    
+     6,847,154,679      instructions                     #    2.28  insn per cycle         
+       1.069270257 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 5721) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.696812e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.752365e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.752365e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.689887e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.745378e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.745378e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008857e+02 +- 5.002468e+01 )  GeV^-2
-TOTAL       :     0.986126 sec
+TOTAL       :     0.986999 sec
 INFO: No Floating Point Exceptions have been reported
-     2,807,740,600      cycles                           #    2.837 GHz                    
-     6,444,005,515      instructions                     #    2.30  insn per cycle         
-       0.993303282 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 5521) (512y:   22) (512z:    0)
+     2,801,128,869      cycles                           #    2.830 GHz                    
+     6,436,964,591      instructions                     #    2.30  insn per cycle         
+       0.990525270 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 5497) (512y:   22) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.410526e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.449814e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.449814e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.390544e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.428498e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.428498e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008856e+02 +- 5.002468e+01 )  GeV^-2
-TOTAL       :     1.184868 sec
+TOTAL       :     1.197863 sec
 INFO: No Floating Point Exceptions have been reported
-     2,258,261,444      cycles                           #    1.902 GHz                    
-     3,761,765,213      instructions                     #    1.67  insn per cycle         
-       1.189729512 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2469) (512y:   29) (512z: 4082)
+     2,249,856,205      cycles                           #    1.874 GHz                    
+     3,755,019,516      instructions                     #    1.67  insn per cycle         
+       1.201521180 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2445) (512y:   29) (512z: 4082)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
index 23d2b99348..b56fab2636 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
-DATE: 2024-06-28_20:21:23
+DATE: 2024-08-08_19:54:34
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 8.717161e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.040648e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.055523e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.873225e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.048994e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.062769e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     0.465962 sec
+TOTAL       :     0.468393 sec
 INFO: No Floating Point Exceptions have been reported
-     2,036,521,358      cycles                           #    2.930 GHz                    
-     2,917,582,884      instructions                     #    1.43  insn per cycle         
-       0.765122552 seconds time elapsed
+     2,013,463,276      cycles                           #    2.926 GHz                    
+     2,843,704,920      instructions                     #    1.41  insn per cycle         
+       0.746969806 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 1.106058e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.316620e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.329020e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.105683e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.317981e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.329407e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 6.734461e+02 +- 4.775415e+02 )  GeV^-2
-TOTAL       :     0.602784 sec
+TOTAL       :     0.602858 sec
 INFO: No Floating Point Exceptions have been reported
-     2,487,016,630      cycles                           #    2.958 GHz                    
-     3,810,655,447      instructions                     #    1.53  insn per cycle         
-       0.899326894 seconds time elapsed
+     2,481,502,789      cycles                           #    2.952 GHz                    
+     3,777,860,843      instructions                     #    1.52  insn per cycle         
+       0.899194246 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.465065e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.477007e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.477007e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.428536e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.440162e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.440162e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     6.670736 sec
+TOTAL       :     6.766520 sec
 INFO: No Floating Point Exceptions have been reported
-    20,214,364,605      cycles                           #    3.030 GHz                    
-    60,955,289,572      instructions                     #    3.02  insn per cycle         
-       6.676699465 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1233) (avx2:    0) (512y:    0) (512z:    0)
+    20,196,006,274      cycles                           #    2.983 GHz                    
+    60,947,190,146      instructions                     #    3.02  insn per cycle         
+       6.770695543 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1220) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.688585e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.732660e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.732660e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.786932e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.830680e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.830680e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     3.517021 sec
+TOTAL       :     3.442084 sec
 INFO: No Floating Point Exceptions have been reported
-    10,461,894,641      cycles                           #    2.972 GHz                    
-    30,831,983,788      instructions                     #    2.95  insn per cycle         
-       3.523187933 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 5362) (avx2:    0) (512y:    0) (512z:    0)
+    10,443,979,206      cycles                           #    3.032 GHz                    
+    30,824,270,405      instructions                     #    2.95  insn per cycle         
+       3.445851321 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 5350) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.571199e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.744609e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.744609e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.470779e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.644870e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.644870e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     1.735065 sec
+TOTAL       :     1.749981 sec
 INFO: No Floating Point Exceptions have been reported
-     4,952,568,801      cycles                           #    2.850 GHz                    
-    11,366,247,235      instructions                     #    2.30  insn per cycle         
-       1.741580355 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4782) (512y:    0) (512z:    0)
+     4,950,819,939      cycles                           #    2.824 GHz                    
+    11,360,637,335      instructions                     #    2.29  insn per cycle         
+       1.753761622 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4764) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.082064e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.103867e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.103867e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.072349e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.094125e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.094125e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     1.540104 sec
+TOTAL       :     1.547382 sec
 INFO: No Floating Point Exceptions have been reported
-     4,393,896,240      cycles                           #    2.853 GHz                    
-    10,616,997,940      instructions                     #    2.42  insn per cycle         
-       1.544265101 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4515) (512y:   83) (512z:    0)
+     4,393,258,157      cycles                           #    2.833 GHz                    
+    10,610,345,317      instructions                     #    2.42  insn per cycle         
+       1.551099869 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4491) (512y:   83) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.307174e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.408952e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.408952e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.179185e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.278821e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.278821e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     2.267048 sec
+TOTAL       :     2.303939 sec
 INFO: No Floating Point Exceptions have been reported
-     4,251,639,994      cycles                           #    1.873 GHz                    
-     6,173,180,709      instructions                     #    1.45  insn per cycle         
-       2.272859583 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2141) (512y:  117) (512z: 3652)
+     4,243,069,453      cycles                           #    1.839 GHz                    
+     6,166,943,639      instructions                     #    1.45  insn per cycle         
+       2.307918272 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2117) (512y:  117) (512z: 3652)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt
index 8e7f8fcace..02b75df755 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
-DATE: 2024-06-28_20:21:48
+DATE: 2024-08-08_19:54:59
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 8.687531e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.040941e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.054772e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.792781e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.038946e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.052598e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     0.466409 sec
+TOTAL       :     0.468036 sec
 INFO: No Floating Point Exceptions have been reported
-     2,036,972,851      cycles                           #    2.951 GHz                    
-     2,896,753,142      instructions                     #    1.42  insn per cycle         
-       0.912477306 seconds time elapsed
+     1,985,001,604      cycles                           #    2.907 GHz                    
+     2,766,137,748      instructions                     #    1.39  insn per cycle         
+       0.741175013 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 1.101045e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.308119e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.319880e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.100333e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.310665e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.321752e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 6.734461e+02 +- 4.775415e+02 )  GeV^-2
-TOTAL       :     0.598149 sec
+TOTAL       :     0.598767 sec
 INFO: No Floating Point Exceptions have been reported
-     2,466,725,732      cycles                           #    2.955 GHz                    
-     3,804,698,508      instructions                     #    1.54  insn per cycle         
-       0.893566745 seconds time elapsed
+     2,453,028,425      cycles                           #    2.950 GHz                    
+     3,661,775,107      instructions                     #    1.49  insn per cycle         
+       0.892773102 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.451478e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.463107e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.463107e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.443765e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.455326e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.455326e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     6.707286 sec
+TOTAL       :     6.725300 sec
 INFO: No Floating Point Exceptions have been reported
-    20,287,610,611      cycles                           #    3.023 GHz                    
-    61,178,057,539      instructions                     #    3.02  insn per cycle         
-       6.713126605 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1285) (avx2:    0) (512y:    0) (512z:    0)
+    20,276,202,254      cycles                           #    3.014 GHz                    
+    61,176,047,563      instructions                     #    3.02  insn per cycle         
+       6.729394202 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1272) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.862832e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.908536e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.908536e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.782126e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.826623e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.826623e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     3.391985 sec
+TOTAL       :     3.445491 sec
 INFO: No Floating Point Exceptions have been reported
-    10,320,455,746      cycles                           #    3.040 GHz                    
-    30,541,990,731      instructions                     #    2.96  insn per cycle         
-       3.398241448 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 5166) (avx2:    0) (512y:    0) (512z:    0)
+    10,362,676,163      cycles                           #    3.005 GHz                    
+    30,536,337,790      instructions                     #    2.95  insn per cycle         
+       3.449270850 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 5154) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.179015e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.338205e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.338205e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.061590e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.221412e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.221412e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     1.807157 sec
+TOTAL       :     1.828348 sec
 INFO: No Floating Point Exceptions have been reported
-     5,150,435,949      cycles                           #    2.844 GHz                    
-    11,880,396,494      instructions                     #    2.31  insn per cycle         
-       1.815550042 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4893) (512y:    0) (512z:    0)
+     5,140,078,208      cycles                           #    2.807 GHz                    
+    11,874,984,280      instructions                     #    2.31  insn per cycle         
+       1.832218653 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4875) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.012527e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.031498e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.031498e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.004120e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.023004e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.023004e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     1.641252 sec
+TOTAL       :     1.651331 sec
 INFO: No Floating Point Exceptions have been reported
-     4,676,385,612      cycles                           #    2.845 GHz                    
-    11,173,497,252      instructions                     #    2.39  insn per cycle         
-       1.647761659 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4520) (512y:  238) (512z:    0)
+     4,668,851,118      cycles                           #    2.822 GHz                    
+    11,168,266,795      instructions                     #    2.39  insn per cycle         
+       1.655171295 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4496) (512y:  238) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [h
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.320732e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.423530e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.423530e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.200167e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.298361e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.298361e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.008920e+02 +- 5.001681e+01 )  GeV^-2
-TOTAL       :     2.262199 sec
+TOTAL       :     2.297641 sec
 INFO: No Floating Point Exceptions have been reported
-     4,261,802,804      cycles                           #    1.881 GHz                    
-     6,412,763,401      instructions                     #    1.50  insn per cycle         
-       2.269882721 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2037) (512y:  163) (512z: 3730)
+     4,253,384,705      cycles                           #    1.849 GHz                    
+     6,407,420,579      instructions                     #    1.51  insn per cycle         
+       2.301529661 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2013) (512y:  163) (512z: 3730)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt
index 606ff35de6..ab0ea6da4a 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-06-28_20:22:13
+DATE: 2024-08-08_19:55:25
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.459828e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.485766e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.487928e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.488153e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.514881e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.516998e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     0.524789 sec
+TOTAL       :     0.525204 sec
 INFO: No Floating Point Exceptions have been reported
-     2,242,199,367      cycles                           #    2.960 GHz                    
-     3,521,289,668      instructions                     #    1.57  insn per cycle         
-       1.006468673 seconds time elapsed
+     2,218,473,016      cycles                           #    2.933 GHz                    
+     3,463,122,045      instructions                     #    1.56  insn per cycle         
+       0.815780769 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.137707e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.166699e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.167947e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.132223e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.161610e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.162761e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 6.665112e+00 +- 5.002651e+00 )  GeV^-4
-TOTAL       :     3.031945 sec
+TOTAL       :     3.033638 sec
 INFO: No Floating Point Exceptions have been reported
-     9,954,167,339      cycles                           #    3.019 GHz                    
-    21,948,236,097      instructions                     #    2.20  insn per cycle         
-       3.356435303 seconds time elapsed
+     9,809,726,664      cycles                           #    2.987 GHz                    
+    20,834,555,403      instructions                     #    2.12  insn per cycle         
+       3.343721812 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.942202e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.943197e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.943197e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.933106e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.934097e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.934097e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     8.455078 sec
+TOTAL       :     8.490765 sec
 INFO: No Floating Point Exceptions have been reported
-    25,685,427,078      cycles                           #    3.037 GHz                    
-    78,966,363,864      instructions                     #    3.07  insn per cycle         
-       8.461150899 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 4856) (avx2:    0) (512y:    0) (512z:    0)
+    25,657,464,355      cycles                           #    3.021 GHz                    
+    78,956,678,283      instructions                     #    3.08  insn per cycle         
+       8.494928864 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 4843) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.637691e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.640965e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.640965e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.556899e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.560135e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.560135e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     4.519192 sec
+TOTAL       :     4.617381 sec
 INFO: No Floating Point Exceptions have been reported
-    13,125,941,934      cycles                           #    2.903 GHz                    
-    39,566,682,586      instructions                     #    3.01  insn per cycle         
-       4.525791931 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:13211) (avx2:    0) (512y:    0) (512z:    0)
+    13,096,002,004      cycles                           #    2.834 GHz                    
+    39,560,686,282      instructions                     #    3.02  insn per cycle         
+       4.621306822 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:13199) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.347155e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.363965e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.363965e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.312969e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.330861e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.330861e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     1.975226 sec
+TOTAL       :     1.979952 sec
 INFO: No Floating Point Exceptions have been reported
-     5,639,645,611      cycles                           #    2.852 GHz                    
-    13,831,852,045      instructions                     #    2.45  insn per cycle         
-       1.981608604 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11548) (512y:    0) (512z:    0)
+     5,592,710,730      cycles                           #    2.820 GHz                    
+    13,825,002,673      instructions                     #    2.47  insn per cycle         
+       1.983978333 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11530) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.876912e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.896755e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.896755e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.448686e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.470931e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.470931e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     1.858025 sec
+TOTAL       :     1.742543 sec
 INFO: No Floating Point Exceptions have been reported
-     4,952,320,771      cycles                           #    2.661 GHz                    
-    12,513,090,040      instructions                     #    2.53  insn per cycle         
-       1.864384055 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:10473) (512y:   88) (512z:    0)
+     4,950,283,084      cycles                           #    2.836 GHz                    
+    12,507,380,266      instructions                     #    2.53  insn per cycle         
+       1.746261350 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:10449) (512y:   88) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.393007e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.406397e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.406397e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.208746e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.222007e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.222007e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     2.229067 sec
+TOTAL       :     2.282175 sec
 INFO: No Floating Point Exceptions have been reported
-     4,149,880,862      cycles                           #    1.859 GHz                    
-     6,398,246,742      instructions                     #    1.54  insn per cycle         
-       2.235781935 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1998) (512y:  102) (512z: 9391)
+     4,146,883,314      cycles                           #    1.815 GHz                    
+     6,393,760,552      instructions                     #    1.54  insn per cycle         
+       2.285979679 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1974) (512y:  102) (512z: 9391)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt
index 3e1ba5193c..9aa087c04f 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-06-28_20:52:20
+DATE: 2024-08-08_20:19:34
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.138565e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.475918e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.475918e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.112227e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.443687e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.443687e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     0.517848 sec
+TOTAL       :     0.518381 sec
 INFO: No Floating Point Exceptions have been reported
-     2,215,015,851      cycles                           #    2.973 GHz                    
-     3,497,258,009      instructions                     #    1.58  insn per cycle         
-       0.806722822 seconds time elapsed
+     2,176,799,915      cycles                           #    2.911 GHz                    
+     3,495,470,615      instructions                     #    1.61  insn per cycle         
+       0.808139854 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -79,18 +79,20 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.650886e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.124920e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.124920e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.648774e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.128576e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.128576e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 6.665112e+00 +- 5.002651e+00 )  GeV^-4
-TOTAL       :     3.300589 sec
+TOTAL       :     3.310822 sec
 INFO: No Floating Point Exceptions have been reported
-    10,784,891,679      cycles                           #    3.015 GHz                    
-    23,660,211,568      instructions                     #    2.19  insn per cycle         
-       3.633362684 seconds time elapsed
+    10,679,469,031      cycles                           #    2.985 GHz                    
+    23,830,814,413      instructions                     #    2.23  insn per cycle         
+       3.633830469 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -108,20 +110,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.954605e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.955629e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.955629e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.923317e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.924229e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.924229e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     8.405087 sec
+TOTAL       :     8.538018 sec
 INFO: No Floating Point Exceptions have been reported
-    25,704,111,857      cycles                           #    3.057 GHz                    
-    78,968,382,574      instructions                     #    3.07  insn per cycle         
-       8.409623563 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 4856) (avx2:    0) (512y:    0) (512z:    0)
+    25,699,355,856      cycles                           #    3.009 GHz                    
+    78,962,606,878      instructions                     #    3.07  insn per cycle         
+       8.542523167 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 4843) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -137,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.650719e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.654171e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.654171e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.605150e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.608587e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.608587e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     4.506089 sec
+TOTAL       :     4.559554 sec
 INFO: No Floating Point Exceptions have been reported
-    13,135,898,885      cycles                           #    2.913 GHz                    
-    39,579,018,476      instructions                     #    3.01  insn per cycle         
-       4.510608241 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:13211) (avx2:    0) (512y:    0) (512z:    0)
+    13,117,342,563      cycles                           #    2.875 GHz                    
+    39,574,473,831      instructions                     #    3.02  insn per cycle         
+       4.563915289 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:13199) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -166,20 +170,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.359873e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.377612e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.377612e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.187581e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.204828e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.204828e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     1.975786 sec
+TOTAL       :     2.014036 sec
 INFO: No Floating Point Exceptions have been reported
-     5,657,211,223      cycles                           #    2.858 GHz                    
-    13,841,168,100      instructions                     #    2.45  insn per cycle         
-       1.980284784 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11548) (512y:    0) (512z:    0)
+     5,605,896,422      cycles                           #    2.779 GHz                    
+    13,833,979,214      instructions                     #    2.47  insn per cycle         
+       2.018562637 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11530) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -195,20 +200,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.530978e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.555255e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.555255e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.243444e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.265975e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.265975e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     1.734840 sec
+TOTAL       :     1.784658 sec
 INFO: No Floating Point Exceptions have been reported
-     4,965,921,813      cycles                           #    2.857 GHz                    
-    12,523,097,371      instructions                     #    2.52  insn per cycle         
-       1.739276354 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:10473) (512y:   88) (512z:    0)
+     4,964,309,016      cycles                           #    2.776 GHz                    
+    12,516,237,329      instructions                     #    2.52  insn per cycle         
+       1.788990266 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:10449) (512y:   88) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -224,20 +230,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.449141e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.463773e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.463773e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.077629e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.090790e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.090790e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     2.215526 sec
+TOTAL       :     2.328055 sec
 INFO: No Floating Point Exceptions have been reported
-     4,165,473,123      cycles                           #    1.877 GHz                    
-     6,409,134,884      instructions                     #    1.54  insn per cycle         
-       2.220135516 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1998) (512y:  102) (512z: 9391)
+     4,162,316,275      cycles                           #    1.785 GHz                    
+     6,401,996,872      instructions                     #    1.54  insn per cycle         
+       2.332653341 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1974) (512y:  102) (512z: 9391)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt
index 36e35a9f65..ff7f772058 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-06-28_21:01:57
+DATE: 2024-08-08_20:29:47
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.474154e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.501832e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.503997e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.507693e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.534445e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.536631e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 4.197467e-01 +- 3.250467e-01 )  GeV^-4
-TOTAL       :     0.511354 sec
+TOTAL       :     0.514407 sec
 INFO: No Floating Point Exceptions have been reported
-     2,188,138,832      cycles                           #    2.950 GHz                    
-     3,455,450,866      instructions                     #    1.58  insn per cycle         
-       0.803741465 seconds time elapsed
+     2,174,406,271      cycles                           #    2.930 GHz                    
+     3,461,893,969      instructions                     #    1.59  insn per cycle         
+       0.803766234 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.148640e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.179495e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.180800e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.147428e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.177075e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.178326e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 1.252232e+02 +- 1.234346e+02 )  GeV^-4
-TOTAL       :     3.129383 sec
+TOTAL       :     3.120976 sec
 INFO: No Floating Point Exceptions have been reported
-    10,087,636,146      cycles                           #    2.985 GHz                    
-    20,676,039,095      instructions                     #    2.05  insn per cycle         
-       3.437978751 seconds time elapsed
+    10,019,214,394      cycles                           #    2.972 GHz                    
+    21,025,350,474      instructions                     #    2.10  insn per cycle         
+       3.430265997 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.953051e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.954062e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.954062e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.913744e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.914711e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.914711e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.197467e-01 +- 3.250467e-01 )  GeV^-4
-TOTAL       :     8.408463 sec
+TOTAL       :     8.577743 sec
 INFO: No Floating Point Exceptions have been reported
-    25,683,043,283      cycles                           #    3.053 GHz                    
-    78,961,949,285      instructions                     #    3.07  insn per cycle         
-       8.412560493 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 4856) (avx2:    0) (512y:    0) (512z:    0)
+    25,670,651,990      cycles                           #    2.992 GHz                    
+    78,955,406,875      instructions                     #    3.08  insn per cycle         
+       8.581763598 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 4843) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.538179e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.541408e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.541408e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.605176e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.608431e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.608431e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.197467e-01 +- 3.250467e-01 )  GeV^-4
-TOTAL       :     4.646642 sec
+TOTAL       :     4.556655 sec
 INFO: No Floating Point Exceptions have been reported
-    13,132,428,466      cycles                           #    2.824 GHz                    
-    39,566,359,526      instructions                     #    3.01  insn per cycle         
-       4.650849397 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:13211) (avx2:    0) (512y:    0) (512z:    0)
+    13,109,013,329      cycles                           #    2.875 GHz                    
+    39,558,662,551      instructions                     #    3.02  insn per cycle         
+       4.560750410 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:13199) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.368365e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.385826e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.385826e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.281071e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.297965e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.297965e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.197467e-01 +- 3.250467e-01 )  GeV^-4
-TOTAL       :     1.971160 sec
+TOTAL       :     1.988611 sec
 INFO: No Floating Point Exceptions have been reported
-     5,647,787,452      cycles                           #    2.861 GHz                    
-    13,829,507,630      instructions                     #    2.45  insn per cycle         
-       1.975214929 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11548) (512y:    0) (512z:    0)
+     5,595,768,969      cycles                           #    2.809 GHz                    
+    13,822,292,745      instructions                     #    2.47  insn per cycle         
+       1.992702302 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11530) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.502828e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.525337e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.525337e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.896901e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.917572e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.917572e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.197467e-01 +- 3.250467e-01 )  GeV^-4
-TOTAL       :     1.736925 sec
+TOTAL       :     1.851324 sec
 INFO: No Floating Point Exceptions have been reported
-     4,952,829,535      cycles                           #    2.846 GHz                    
-    12,510,405,299      instructions                     #    2.53  insn per cycle         
-       1.741019325 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:10473) (512y:   88) (512z:    0)
+     4,949,173,347      cycles                           #    2.669 GHz                    
+    12,503,287,563      instructions                     #    2.53  insn per cycle         
+       1.855415164 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:10449) (512y:   88) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.365853e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.379911e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.379911e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.307417e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.320405e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.320405e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.197467e-01 +- 3.250467e-01 )  GeV^-4
-TOTAL       :     2.237795 sec
+TOTAL       :     2.252212 sec
 INFO: No Floating Point Exceptions have been reported
-     4,156,890,577      cycles                           #    1.855 GHz                    
-     6,396,158,206      instructions                     #    1.54  insn per cycle         
-       2.242043953 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1998) (512y:  102) (512z: 9391)
+     4,148,121,362      cycles                           #    1.839 GHz                    
+     6,388,958,727      instructions                     #    1.54  insn per cycle         
+       2.256422988 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1974) (512y:  102) (512z: 9391)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt
index 207f46b632..8c55b22907 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-06-28_20:59:12
+DATE: 2024-08-08_20:26:59
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.461876e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.490862e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.493015e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.458961e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.485253e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.488049e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     0.509960 sec
+TOTAL       :     0.514464 sec
 INFO: No Floating Point Exceptions have been reported
-     2,189,426,255      cycles                           #    2.953 GHz                    
-     3,479,572,247      instructions                     #    1.59  insn per cycle         
-       0.800831207 seconds time elapsed
+     2,130,639,833      cycles                           #    2.860 GHz                    
+     3,343,542,179      instructions                     #    1.57  insn per cycle         
+       0.805221680 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.145425e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.176585e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.177885e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.127051e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.156110e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.157363e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 6.665112e+00 +- 5.002651e+00 )  GeV^-4
-TOTAL       :     3.069887 sec
+TOTAL       :     3.075386 sec
 INFO: No Floating Point Exceptions have been reported
-     9,983,044,404      cycles                           #    3.007 GHz                    
-    22,599,909,333      instructions                     #    2.26  insn per cycle         
-       3.378270496 seconds time elapsed
+     9,595,195,883      cycles                           #    2.879 GHz                    
+    21,169,008,885      instructions                     #    2.21  insn per cycle         
+       3.388723748 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.948901e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.949866e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.949866e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.853624e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.854505e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.854505e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     8.424966 sec
+TOTAL       :     8.854273 sec
 INFO: No Floating Point Exceptions have been reported
-    25,683,040,628      cycles                           #    3.047 GHz                    
-    78,961,831,753      instructions                     #    3.07  insn per cycle         
-       8.429259681 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 4856) (avx2:    0) (512y:    0) (512z:    0)
+    25,673,092,183      cycles                           #    2.899 GHz                    
+    78,956,489,516      instructions                     #    3.08  insn per cycle         
+       8.858619563 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 4843) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.653114e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.656450e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.656450e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.555877e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.559175e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.559175e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     4.498779 sec
+TOTAL       :     4.618236 sec
 INFO: No Floating Point Exceptions have been reported
-    13,124,985,281      cycles                           #    2.916 GHz                    
-    39,566,085,965      instructions                     #    3.01  insn per cycle         
-       4.502971209 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:13211) (avx2:    0) (512y:    0) (512z:    0)
+    13,105,607,424      cycles                           #    2.836 GHz                    
+    39,562,262,758      instructions                     #    3.02  insn per cycle         
+       4.622614183 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:13199) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.369414e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.386730e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.386730e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.117944e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.134423e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.134423e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     1.969220 sec
+TOTAL       :     2.026386 sec
 INFO: No Floating Point Exceptions have been reported
-     5,641,945,348      cycles                           #    2.860 GHz                    
-    13,831,299,800      instructions                     #    2.45  insn per cycle         
-       1.973388441 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11548) (512y:    0) (512z:    0)
+     5,589,116,983      cycles                           #    2.754 GHz                    
+    13,823,429,494      instructions                     #    2.47  insn per cycle         
+       2.030436364 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11530) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.516955e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.539518e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.539518e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.385930e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.407557e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.407557e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     1.732837 sec
+TOTAL       :     1.753538 sec
 INFO: No Floating Point Exceptions have been reported
-     4,949,423,150      cycles                           #    2.851 GHz                    
-    12,512,060,098      instructions                     #    2.53  insn per cycle         
-       1.737031693 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:10473) (512y:   88) (512z:    0)
+     4,940,731,112      cycles                           #    2.812 GHz                    
+    12,505,003,217      instructions                     #    2.53  insn per cycle         
+       1.757654269 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:10449) (512y:   88) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.477478e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.491068e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.491068e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.329600e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.342625e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.342625e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     2.202805 sec
+TOTAL       :     2.243900 sec
 INFO: No Floating Point Exceptions have been reported
-     4,149,646,633      cycles                           #    1.881 GHz                    
-     6,397,797,180      instructions                     #    1.54  insn per cycle         
-       2.207008648 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1998) (512y:  102) (512z: 9391)
+     4,145,687,524      cycles                           #    1.845 GHz                    
+     6,390,893,367      instructions                     #    1.54  insn per cycle         
+       2.248144727 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1974) (512y:  102) (512z: 9391)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
index 151bf82020..28e1d95034 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-06-28_20:56:33
+DATE: 2024-08-08_20:24:16
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,15 +50,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.203392e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.508627e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.510845e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.229613e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.520921e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.523094e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     0.513307 sec
+TOTAL       :     0.513514 sec
 INFO: No Floating Point Exceptions have been reported
-     2,189,978,735      cycles                           #    2.953 GHz                    
-     3,500,307,012      instructions                     #    1.60  insn per cycle         
-       0.803039724 seconds time elapsed
+     2,168,346,936      cycles                           #    2.927 GHz                    
+     3,433,459,385      instructions                     #    1.58  insn per cycle         
+       0.802152079 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -70,18 +70,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.744166e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.178157e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.179382e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.733483e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.157890e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.159150e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 6.665112e+00 +- 5.002651e+00 )  GeV^-4
-TOTAL       :     3.194253 sec
+TOTAL       :     3.199522 sec
 INFO: No Floating Point Exceptions have been reported
-    10,437,486,034      cycles                           #    3.029 GHz                    
-    24,018,289,609      instructions                     #    2.30  insn per cycle         
-       3.502431258 seconds time elapsed
+    10,294,194,017      cycles                           #    2.982 GHz                    
+    21,521,466,269      instructions                     #    2.09  insn per cycle         
+       3.508277099 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -98,20 +100,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.949609e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.950599e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.950599e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.923954e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.924900e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.924900e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     8.422477 sec
+TOTAL       :     8.530428 sec
 INFO: No Floating Point Exceptions have been reported
-    25,685,602,378      cycles                           #    3.049 GHz                    
-    78,962,442,414      instructions                     #    3.07  insn per cycle         
-       8.426740936 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 4856) (avx2:    0) (512y:    0) (512z:    0)
+    25,661,796,778      cycles                           #    3.007 GHz                    
+    78,954,509,974      instructions                     #    3.08  insn per cycle         
+       8.534417643 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 4843) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -126,20 +129,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.636823e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.640239e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.640239e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.615782e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.619130e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.619130e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     4.518815 sec
+TOTAL       :     4.541944 sec
 INFO: No Floating Point Exceptions have been reported
-    13,125,924,467      cycles                           #    2.903 GHz                    
-    39,566,231,833      instructions                     #    3.01  insn per cycle         
-       4.523062408 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:13211) (avx2:    0) (512y:    0) (512z:    0)
+    13,126,189,517      cycles                           #    2.888 GHz                    
+    39,559,744,202      instructions                     #    3.01  insn per cycle         
+       4.546027002 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:13199) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -154,20 +158,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.318809e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.336114e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.336114e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.299850e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.317113e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.317113e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     1.981355 sec
+TOTAL       :     1.982404 sec
 INFO: No Floating Point Exceptions have been reported
-     5,639,339,757      cycles                           #    2.841 GHz                    
-    13,830,515,441      instructions                     #    2.45  insn per cycle         
-       1.985600666 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11548) (512y:    0) (512z:    0)
+     5,586,639,772      cycles                           #    2.813 GHz                    
+    13,823,166,385      instructions                     #    2.47  insn per cycle         
+       1.986590396 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11530) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -182,20 +187,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.503636e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.525847e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.525847e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.384353e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.406906e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.406906e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     1.735480 sec
+TOTAL       :     1.753945 sec
 INFO: No Floating Point Exceptions have been reported
-     4,951,242,002      cycles                           #    2.848 GHz                    
-    12,513,124,213      instructions                     #    2.53  insn per cycle         
-       1.739580059 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:10473) (512y:   88) (512z:    0)
+     4,942,572,018      cycles                           #    2.813 GHz                    
+    12,504,933,165      instructions                     #    2.53  insn per cycle         
+       1.758084275 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:10449) (512y:   88) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -210,20 +216,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.313441e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.326736e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.326736e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.317460e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.330821e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.330821e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     2.252460 sec
+TOTAL       :     2.247518 sec
 INFO: No Floating Point Exceptions have been reported
-     4,155,241,568      cycles                           #    1.842 GHz                    
-     6,398,190,319      instructions                     #    1.54  insn per cycle         
-       2.256689169 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1998) (512y:  102) (512z: 9391)
+     4,146,774,770      cycles                           #    1.843 GHz                    
+     6,391,452,350      instructions                     #    1.54  insn per cycle         
+       2.251569316 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1974) (512y:  102) (512z: 9391)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
index cbb644cf1f..ef490ee27f 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-06-28_20:22:45
+DATE: 2024-08-08_19:55:57
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.472054e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.498222e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.500508e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.468386e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.495424e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.497730e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     0.523112 sec
+TOTAL       :     0.528153 sec
 INFO: No Floating Point Exceptions have been reported
-     2,236,042,620      cycles                           #    2.962 GHz                    
-     3,533,085,294      instructions                     #    1.58  insn per cycle         
-       0.932117194 seconds time elapsed
+     2,223,041,093      cycles                           #    2.885 GHz                    
+     3,357,279,580      instructions                     #    1.51  insn per cycle         
+       0.829273079 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.141957e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.170783e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.171976e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.133736e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.163273e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.164433e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 6.665112e+00 +- 5.002651e+00 )  GeV^-4
-TOTAL       :     3.018899 sec
+TOTAL       :     3.026404 sec
 INFO: No Floating Point Exceptions have been reported
-     9,915,480,311      cycles                           #    3.029 GHz                    
-    22,358,141,614      instructions                     #    2.25  insn per cycle         
-       3.328323631 seconds time elapsed
+     9,787,087,404      cycles                           #    2.984 GHz                    
+    20,868,236,699      instructions                     #    2.13  insn per cycle         
+       3.335921488 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.935763e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.936655e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.936655e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.930451e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.931397e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.931397e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     8.482533 sec
+TOTAL       :     8.501967 sec
 INFO: No Floating Point Exceptions have been reported
-    25,605,418,102      cycles                           #    3.018 GHz                    
-    78,706,969,538      instructions                     #    3.07  insn per cycle         
-       8.488935811 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 4205) (avx2:    0) (512y:    0) (512z:    0)
+    25,635,869,243      cycles                           #    3.014 GHz                    
+    78,699,985,409      instructions                     #    3.07  insn per cycle         
+       8.506017009 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 4192) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.641012e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.644459e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.644459e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.635004e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.638325e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.638325e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     4.515146 sec
+TOTAL       :     4.518323 sec
 INFO: No Floating Point Exceptions have been reported
-    13,056,661,799      cycles                           #    2.891 GHz                    
-    39,457,635,850      instructions                     #    3.02  insn per cycle         
-       4.521450799 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:12985) (avx2:    0) (512y:    0) (512z:    0)
+    13,043,304,130      cycles                           #    2.885 GHz                    
+    39,451,387,281      instructions                     #    3.02  insn per cycle         
+       4.522544486 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:12973) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.270313e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.286680e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.286680e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.103214e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.119837e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.119837e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     1.992198 sec
+TOTAL       :     2.030819 sec
 INFO: No Floating Point Exceptions have been reported
-     5,675,658,087      cycles                           #    2.844 GHz                    
-    13,917,752,986      instructions                     #    2.45  insn per cycle         
-       1.998576575 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11610) (512y:    0) (512z:    0)
+     5,706,370,481      cycles                           #    2.806 GHz                    
+    13,911,650,507      instructions                     #    2.44  insn per cycle         
+       2.034636014 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11592) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.387023e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.408110e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.408110e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.209342e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.231718e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.231718e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     1.756363 sec
+TOTAL       :     1.787809 sec
 INFO: No Floating Point Exceptions have been reported
-     4,993,263,033      cycles                           #    2.837 GHz                    
-    12,609,677,124      instructions                     #    2.53  insn per cycle         
-       1.762986317 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:10457) (512y:  240) (512z:    0)
+     4,991,279,132      cycles                           #    2.786 GHz                    
+    12,604,125,286      instructions                     #    2.53  insn per cycle         
+       1.792337833 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:10433) (512y:  240) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.319140e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.332490e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.332490e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.276351e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.289893e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.289893e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     2.250531 sec
+TOTAL       :     2.260957 sec
 INFO: No Floating Point Exceptions have been reported
-     4,162,515,901      cycles                           #    1.847 GHz                    
-     6,507,204,315      instructions                     #    1.56  insn per cycle         
-       2.256696447 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1774) (512y:  194) (512z: 9387)
+     4,149,253,590      cycles                           #    1.833 GHz                    
+     6,500,352,718      instructions                     #    1.57  insn per cycle         
+       2.264815173 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1750) (512y:  194) (512z: 9387)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt
index 6a2d99b3cd..bbaea3caef 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-06-28_20:43:06
+DATE: 2024-08-08_20:10:19
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.253055e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.278247e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.280335e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.246678e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.268467e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.270191e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     0.536312 sec
+TOTAL       :     0.534085 sec
 INFO: No Floating Point Exceptions have been reported
-     2,250,274,110      cycles                           #    2.945 GHz                    
-     3,555,189,986      instructions                     #    1.58  insn per cycle         
-       0.822638069 seconds time elapsed
+     2,285,518,624      cycles                           #    2.953 GHz                    
+     3,580,561,444      instructions                     #    1.57  insn per cycle         
+       0.832119310 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.760808e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.789087e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.790247e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.761384e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.784291e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.785252e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 6.665112e+00 +- 5.002651e+00 )  GeV^-4
-TOTAL       :     3.306329 sec
+TOTAL       :     3.301764 sec
 INFO: No Floating Point Exceptions have been reported
-    10,724,834,853      cycles                           #    3.010 GHz                    
-    24,802,280,834      instructions                     #    2.31  insn per cycle         
-       3.619173157 seconds time elapsed
+    10,582,525,253      cycles                           #    2.981 GHz                    
+    22,709,986,647      instructions                     #    2.15  insn per cycle         
+       3.609006709 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.381772e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.382276e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.382276e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.342825e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.343311e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.343311e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :    37.438618 sec
+TOTAL       :    37.771526 sec
 INFO: No Floating Point Exceptions have been reported
-   113,491,355,747      cycles                           #    3.031 GHz                    
-   144,836,012,190      instructions                     #    1.28  insn per cycle         
-      37.442851666 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:21407) (avx2:    0) (512y:    0) (512z:    0)
+   112,991,669,428      cycles                           #    2.992 GHz                    
+   144,862,430,473      instructions                     #    1.28  insn per cycle         
+      37.775737563 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:21361) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.196978e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.199549e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.199549e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.180115e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.182680e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.182680e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     5.139131 sec
+TOTAL       :     5.162984 sec
 INFO: No Floating Point Exceptions have been reported
-    14,751,408,883      cycles                           #    2.869 GHz                    
-    37,659,055,647      instructions                     #    2.55  insn per cycle         
-       5.143466599 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:68265) (avx2:    0) (512y:    0) (512z:    0)
+    14,747,517,010      cycles                           #    2.855 GHz                    
+    37,650,782,777      instructions                     #    2.55  insn per cycle         
+       5.167050022 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:68253) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.244187e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.258766e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.258766e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.587961e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.601478e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.601478e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     2.273650 sec
+TOTAL       :     2.167267 sec
 INFO: No Floating Point Exceptions have been reported
-     6,132,413,035      cycles                           #    2.692 GHz                    
-    13,068,215,974      instructions                     #    2.13  insn per cycle         
-       2.278418111 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:46983) (512y:    0) (512z:    0)
+     6,123,933,660      cycles                           #    2.822 GHz                    
+    13,061,783,520      instructions                     #    2.13  insn per cycle         
+       2.171395105 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:46965) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.165285e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.187084e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.187084e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.164851e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.185111e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.185111e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     1.798975 sec
+TOTAL       :     1.795482 sec
 INFO: No Floating Point Exceptions have been reported
-     5,073,754,093      cycles                           #    2.815 GHz                    
-    11,460,503,333      instructions                     #    2.26  insn per cycle         
-       1.803277093 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:40514) (512y:  285) (512z:    0)
+     5,057,846,668      cycles                           #    2.812 GHz                    
+    11,453,287,308      instructions                     #    2.26  insn per cycle         
+       1.799543537 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:40490) (512y:  285) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.668145e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.682901e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.682901e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.447733e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.461062e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.461062e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     2.148661 sec
+TOTAL       :     2.208265 sec
 INFO: No Floating Point Exceptions have been reported
-     3,962,496,219      cycles                           #    1.845 GHz                    
-     5,934,435,276      instructions                     #    1.50  insn per cycle         
-       2.153008467 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2456) (512y:  337) (512z:39348)
+     3,952,574,407      cycles                           #    1.787 GHz                    
+     5,928,010,897      instructions                     #    1.50  insn per cycle         
+       2.212410955 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2432) (512y:  337) (512z:39348)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt
index 1c810d5448..7583c01cf4 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-06-28_20:44:10
+DATE: 2024-08-08_20:11:26
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.265225e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.289962e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.291972e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.275171e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.299147e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.301063e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     0.532495 sec
+TOTAL       :     0.533669 sec
 INFO: No Floating Point Exceptions have been reported
-     2,271,399,447      cycles                           #    2.937 GHz                    
-     3,489,556,693      instructions                     #    1.54  insn per cycle         
-       0.830540125 seconds time elapsed
+     2,269,961,618      cycles                           #    2.940 GHz                    
+     3,538,568,106      instructions                     #    1.56  insn per cycle         
+       0.830876846 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.744110e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.772071e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.773220e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.755572e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.778494e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.779486e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 6.665112e+00 +- 5.002651e+00 )  GeV^-4
-TOTAL       :     3.299405 sec
+TOTAL       :     3.298195 sec
 INFO: No Floating Point Exceptions have been reported
-    10,705,227,455      cycles                           #    3.011 GHz                    
-    24,486,514,480      instructions                     #    2.29  insn per cycle         
-       3.611307721 seconds time elapsed
+    10,673,699,971      cycles                           #    3.000 GHz                    
+    24,748,682,176      instructions                     #    2.32  insn per cycle         
+       3.615699896 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2
@@ -95,26 +97,27 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.357685e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.358166e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.358166e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.321186e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.321644e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.321644e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :    37.644054 sec
+TOTAL       :    37.957787 sec
 INFO: No Floating Point Exceptions have been reported
-   113,873,744,260      cycles                           #    3.025 GHz                    
-   144,286,882,817      instructions                     #    1.27  insn per cycle         
-      37.648261793 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:21037) (avx2:    0) (512y:    0) (512z:    0)
+   113,686,913,957      cycles                           #    2.995 GHz                    
+   144,259,453,305      instructions                     #    1.27  insn per cycle         
+      37.961860960 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:20934) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++)    = 6.626675e-04
-Avg ME (F77/C++)    = 6.6266731198140450E-004
-Relative difference = 2.83729918072716e-07
+Avg ME (F77/C++)    = 6.6266731198140439E-004
+Relative difference = 2.8372991823632784e-07
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.015988e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.018316e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.018316e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.073725e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.076096e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.076096e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     5.446710 sec
+TOTAL       :     5.341043 sec
 INFO: No Floating Point Exceptions have been reported
-    15,292,753,074      cycles                           #    2.806 GHz                    
-    38,397,818,203      instructions                     #    2.51  insn per cycle         
-       5.451032302 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:69655) (avx2:    0) (512y:    0) (512z:    0)
+    15,271,797,585      cycles                           #    2.858 GHz                    
+    38,390,165,623      instructions                     #    2.51  insn per cycle         
+       5.345237036 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:69643) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.782928e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.797665e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.797665e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.624786e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.638797e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.638797e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     2.116755 sec
+TOTAL       :     2.157053 sec
 INFO: No Floating Point Exceptions have been reported
-     6,022,769,262      cycles                           #    2.841 GHz                    
-    12,941,827,772      instructions                     #    2.15  insn per cycle         
-       2.120967579 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:46109) (512y:    0) (512z:    0)
+     6,008,150,983      cycles                           #    2.781 GHz                    
+    12,934,571,742      instructions                     #    2.15  insn per cycle         
+       2.161176604 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:46091) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.211510e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.233072e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.233072e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.062477e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.083007e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.083007e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     1.789778 sec
+TOTAL       :     1.815728 sec
 INFO: No Floating Point Exceptions have been reported
-     5,100,919,756      cycles                           #    2.845 GHz                    
-    11,456,622,218      instructions                     #    2.25  insn per cycle         
-       1.793897318 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:40158) (512y:  219) (512z:    0)
+     5,090,244,384      cycles                           #    2.798 GHz                    
+    11,449,331,673      instructions                     #    2.25  insn per cycle         
+       1.819810741 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:40134) (512y:  219) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.723861e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.738531e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.738531e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.561516e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.575406e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.575406e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     2.132674 sec
+TOTAL       :     2.175028 sec
 INFO: No Floating Point Exceptions have been reported
-     3,961,317,157      cycles                           #    1.854 GHz                    
-     5,896,891,551      instructions                     #    1.49  insn per cycle         
-       2.137219432 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1971) (512y:  259) (512z:38926)
+     3,947,332,966      cycles                           #    1.812 GHz                    
+     5,889,708,142      instructions                     #    1.49  insn per cycle         
+       2.179231650 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1947) (512y:  259) (512z:38926)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
index ee69d40ca4..52d8759019 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-06-28_20:23:18
+DATE: 2024-08-08_19:56:30
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 5.969401e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.014832e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.020101e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.984596e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.027561e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.032406e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 4.059596e+00 +- 2.368053e+00 )  GeV^-4
-TOTAL       :     0.489746 sec
+TOTAL       :     0.485881 sec
 INFO: No Floating Point Exceptions have been reported
-     2,016,931,047      cycles                           #    2.848 GHz                    
-     3,025,159,890      instructions                     #    1.50  insn per cycle         
-       0.927349827 seconds time elapsed
+     2,058,871,536      cycles                           #    2.917 GHz                    
+     3,048,657,677      instructions                     #    1.48  insn per cycle         
+       0.765585250 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 8.205747e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.264876e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.267666e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.127584e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.186636e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.189605e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 6.664703e+00 +- 5.072736e+00 )  GeV^-4
-TOTAL       :     1.784442 sec
+TOTAL       :     1.790632 sec
 INFO: No Floating Point Exceptions have been reported
-     5,821,380,715      cycles                           #    2.891 GHz                    
-    11,584,732,165      instructions                     #    1.99  insn per cycle         
-       2.072616133 seconds time elapsed
+     5,978,175,900      cycles                           #    2.960 GHz                    
+    12,554,229,706      instructions                     #    2.10  insn per cycle         
+       2.078428019 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.001998e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.003043e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.003043e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.983107e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.984075e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.984075e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060121e+00 +- 2.367902e+00 )  GeV^-4
-TOTAL       :     8.202217 sec
+TOTAL       :     8.275184 sec
 INFO: No Floating Point Exceptions have been reported
-    24,956,731,457      cycles                           #    3.042 GHz                    
-    79,116,565,186      instructions                     #    3.17  insn per cycle         
-       8.208701764 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3586) (avx2:    0) (512y:    0) (512z:    0)
+    24,981,677,575      cycles                           #    3.018 GHz                    
+    79,112,697,083      instructions                     #    3.17  insn per cycle         
+       8.279194518 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3573) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.262600e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.275603e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.275603e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.049042e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.062007e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.062007e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060119e+00 +- 2.367901e+00 )  GeV^-4
-TOTAL       :     2.267084 sec
+TOTAL       :     2.331496 sec
 INFO: No Floating Point Exceptions have been reported
-     6,521,430,213      cycles                           #    2.873 GHz                    
-    20,277,954,673      instructions                     #    3.11  insn per cycle         
-       2.273392083 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:13797) (avx2:    0) (512y:    0) (512z:    0)
+     6,513,667,582      cycles                           #    2.790 GHz                    
+    20,270,685,743      instructions                     #    3.11  insn per cycle         
+       2.335321002 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:13785) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.663561e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.670648e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.670648e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.631322e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.638001e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.638001e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060560e+00 +- 2.367611e+00 )  GeV^-4
-TOTAL       :     0.994752 sec
+TOTAL       :     1.010094 sec
 INFO: No Floating Point Exceptions have been reported
-     2,837,720,578      cycles                           #    2.844 GHz                    
-     7,073,170,279      instructions                     #    2.49  insn per cycle         
-       1.000889784 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:12076) (512y:    0) (512z:    0)
+     2,858,902,160      cycles                           #    2.822 GHz                    
+     7,066,281,657      instructions                     #    2.47  insn per cycle         
+       1.013626411 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:12058) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.876549e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.885485e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.885485e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.855078e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.863833e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.863833e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060560e+00 +- 2.367611e+00 )  GeV^-4
-TOTAL       :     0.882850 sec
+TOTAL       :     0.888854 sec
 INFO: No Floating Point Exceptions have been reported
-     2,531,723,545      cycles                           #    2.859 GHz                    
-     6,411,241,908      instructions                     #    2.53  insn per cycle         
-       0.889144965 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11050) (512y:   43) (512z:    0)
+     2,514,609,187      cycles                           #    2.820 GHz                    
+     6,403,227,199      instructions                     #    2.55  insn per cycle         
+       0.892442076 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11026) (512y:   43) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.415709e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.420844e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.420844e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.472481e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.477974e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.477974e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060562e+00 +- 2.367612e+00 )  GeV^-4
-TOTAL       :     1.172695 sec
+TOTAL       :     1.118887 sec
 INFO: No Floating Point Exceptions have been reported
-     2,078,810,102      cycles                           #    1.776 GHz                    
-     3,311,309,421      instructions                     #    1.59  insn per cycle         
-       1.179156364 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2615) (512y:   46) (512z: 9609)
+     2,071,045,676      cycles                           #    1.846 GHz                    
+     3,304,181,825      instructions                     #    1.60  insn per cycle         
+       1.122589043 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2591) (512y:   46) (512z: 9609)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt
index 73ac8ddea9..d4f5540c08 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-06-28_20:52:52
+DATE: 2024-08-08_20:20:08
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 5.333953e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.949066e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.949066e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.362722e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.966550e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.966550e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 4.048178e+00 +- 2.364571e+00 )  GeV^-4
-TOTAL       :     0.475381 sec
+TOTAL       :     0.475517 sec
 INFO: No Floating Point Exceptions have been reported
-     2,022,586,615      cycles                           #    2.949 GHz                    
-     3,025,069,636      instructions                     #    1.50  insn per cycle         
-       0.744809477 seconds time elapsed
+     2,001,123,741      cycles                           #    2.916 GHz                    
+     3,014,989,818      instructions                     #    1.51  insn per cycle         
+       0.744972192 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -79,18 +79,20 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 7.011106e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.163446e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.163446e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.951093e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.086269e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.086269e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 6.641709e+00 +- 4.994248e+00 )  GeV^-4
-TOTAL       :     1.955963 sec
+TOTAL       :     1.963357 sec
 INFO: No Floating Point Exceptions have been reported
-     6,597,079,956      cycles                           #    3.008 GHz                    
-    13,224,291,569      instructions                     #    2.00  insn per cycle         
-       2.249954519 seconds time elapsed
+     6,464,131,212      cycles                           #    2.938 GHz                    
+    13,280,566,465      instructions                     #    2.05  insn per cycle         
+       2.255825453 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -108,20 +110,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.979285e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.980287e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.980287e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.961986e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.962995e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.962995e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060121e+00 +- 2.367902e+00 )  GeV^-4
-TOTAL       :     8.298271 sec
+TOTAL       :     8.366737 sec
 INFO: No Floating Point Exceptions have been reported
-    24,988,096,548      cycles                           #    3.010 GHz                    
-    79,123,467,090      instructions                     #    3.17  insn per cycle         
-       8.302811323 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3586) (avx2:    0) (512y:    0) (512z:    0)
+    25,004,224,949      cycles                           #    2.987 GHz                    
+    79,113,889,000      instructions                     #    3.16  insn per cycle         
+       8.370993372 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3573) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -137,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.249110e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.263380e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.263380e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.168882e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.181926e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.181926e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060119e+00 +- 2.367901e+00 )  GeV^-4
-TOTAL       :     2.273331 sec
+TOTAL       :     2.295100 sec
 INFO: No Floating Point Exceptions have been reported
-     6,535,487,857      cycles                           #    2.871 GHz                    
-    20,286,980,927      instructions                     #    3.10  insn per cycle         
-       2.277574276 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:13797) (avx2:    0) (512y:    0) (512z:    0)
+     6,522,736,001      cycles                           #    2.838 GHz                    
+    20,279,496,113      instructions                     #    3.11  insn per cycle         
+       2.299251518 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:13785) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -166,20 +170,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.665095e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.672204e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.672204e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.604472e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.610985e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.610985e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060560e+00 +- 2.367611e+00 )  GeV^-4
-TOTAL       :     0.995964 sec
+TOTAL       :     1.029832 sec
 INFO: No Floating Point Exceptions have been reported
-     2,845,901,940      cycles                           #    2.847 GHz                    
-     7,083,238,590      instructions                     #    2.49  insn per cycle         
-       1.000282605 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:12076) (512y:    0) (512z:    0)
+     2,869,187,737      cycles                           #    2.777 GHz                    
+     7,075,475,577      instructions                     #    2.47  insn per cycle         
+       1.033942723 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:12058) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -195,20 +200,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.875871e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.884797e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.884797e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.863942e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.872787e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.872787e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060560e+00 +- 2.367611e+00 )  GeV^-4
-TOTAL       :     0.884953 sec
+TOTAL       :     0.887626 sec
 INFO: No Floating Point Exceptions have been reported
-     2,544,773,506      cycles                           #    2.864 GHz                    
-     6,420,481,607      instructions                     #    2.52  insn per cycle         
-       0.889219241 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11050) (512y:   43) (512z:    0)
+     2,527,038,904      cycles                           #    2.836 GHz                    
+     6,413,204,152      instructions                     #    2.54  insn per cycle         
+       0.891739175 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11026) (512y:   43) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -224,20 +230,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.484572e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.489964e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.489964e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.473762e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.479361e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.479361e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060562e+00 +- 2.367612e+00 )  GeV^-4
-TOTAL       :     1.115836 sec
+TOTAL       :     1.120677 sec
 INFO: No Floating Point Exceptions have been reported
-     2,087,800,742      cycles                           #    1.865 GHz                    
-     3,321,180,595      instructions                     #    1.59  insn per cycle         
-       1.120177136 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2615) (512y:   46) (512z: 9609)
+     2,080,597,436      cycles                           #    1.851 GHz                    
+     3,313,716,206      instructions                     #    1.59  insn per cycle         
+       1.124889543 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2591) (512y:   46) (512z: 9609)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt
index 2632c41b0b..2bbd6d0428 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-06-28_21:02:29
+DATE: 2024-08-08_20:30:20
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 5.991378e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.036667e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.041414e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.027396e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.072992e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.077839e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 4.159396e-01 +- 3.238803e-01 )  GeV^-4
-TOTAL       :     0.468693 sec
+TOTAL       :     0.472420 sec
 INFO: No Floating Point Exceptions have been reported
-     2,029,615,278      cycles                           #    2.948 GHz                    
-     3,030,247,378      instructions                     #    1.49  insn per cycle         
-       0.745163582 seconds time elapsed
+     2,017,335,926      cycles                           #    2.929 GHz                    
+     2,996,516,741      instructions                     #    1.49  insn per cycle         
+       0.747617629 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 8.178274e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.240581e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.243327e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.176066e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.236543e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.239377e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 1.094367e+02 +- 1.071509e+02 )  GeV^-4
-TOTAL       :     1.866265 sec
+TOTAL       :     1.869944 sec
 INFO: No Floating Point Exceptions have been reported
-     6,299,743,710      cycles                           #    3.010 GHz                    
-    12,600,740,032      instructions                     #    2.00  insn per cycle         
-       2.149831052 seconds time elapsed
+     6,204,679,090      cycles                           #    2.959 GHz                    
+    13,136,993,437      instructions                     #    2.12  insn per cycle         
+       2.155017166 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.998329e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.999303e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.999303e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.981113e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.982134e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.982134e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.208459e-01 +- 3.253446e-01 )  GeV^-4
-TOTAL       :     8.216848 sec
+TOTAL       :     8.283937 sec
 INFO: No Floating Point Exceptions have been reported
-    24,974,321,873      cycles                           #    3.039 GHz                    
-    79,117,922,163      instructions                     #    3.17  insn per cycle         
-       8.220818471 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3586) (avx2:    0) (512y:    0) (512z:    0)
+    24,969,353,482      cycles                           #    3.013 GHz                    
+    79,108,034,680      instructions                     #    3.17  insn per cycle         
+       8.287825380 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3573) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.280961e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.294635e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.294635e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.181056e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.194443e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.194443e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.208457e-01 +- 3.253445e-01 )  GeV^-4
-TOTAL       :     2.261486 sec
+TOTAL       :     2.289520 sec
 INFO: No Floating Point Exceptions have been reported
-     6,522,836,555      cycles                           #    2.880 GHz                    
-    20,276,184,021      instructions                     #    3.11  insn per cycle         
-       2.265504910 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:13797) (avx2:    0) (512y:    0) (512z:    0)
+     6,518,141,305      cycles                           #    2.843 GHz                    
+    20,270,157,027      instructions                     #    3.11  insn per cycle         
+       2.293380252 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:13785) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.668062e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.674946e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.674946e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.629677e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.636717e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.636717e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.214978e-01 +- 3.255521e-01 )  GeV^-4
-TOTAL       :     0.992017 sec
+TOTAL       :     1.012223 sec
 INFO: No Floating Point Exceptions have been reported
-     2,836,186,752      cycles                           #    2.849 GHz                    
-     7,070,350,824      instructions                     #    2.49  insn per cycle         
-       0.995979730 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:12076) (512y:    0) (512z:    0)
+     2,864,292,228      cycles                           #    2.821 GHz                    
+     7,063,008,029      instructions                     #    2.47  insn per cycle         
+       1.016182729 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:12058) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.868046e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.876797e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.876797e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.830887e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.839546e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.839546e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.214978e-01 +- 3.255521e-01 )  GeV^-4
-TOTAL       :     0.886960 sec
+TOTAL       :     0.901658 sec
 INFO: No Floating Point Exceptions have been reported
-     2,536,605,252      cycles                           #    2.849 GHz                    
-     6,407,504,392      instructions                     #    2.53  insn per cycle         
-       0.890901975 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11050) (512y:   43) (512z:    0)
+     2,522,018,356      cycles                           #    2.787 GHz                    
+     6,399,988,861      instructions                     #    2.54  insn per cycle         
+       0.905644388 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11026) (512y:   43) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.493995e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.499521e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.499521e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.485210e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.490986e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.490986e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.214981e-01 +- 3.255523e-01 )  GeV^-4
-TOTAL       :     1.107008 sec
+TOTAL       :     1.110909 sec
 INFO: No Floating Point Exceptions have been reported
-     2,080,229,154      cycles                           #    1.874 GHz                    
-     3,307,176,552      instructions                     #    1.59  insn per cycle         
-       1.110975088 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2615) (512y:   46) (512z: 9609)
+     2,072,711,689      cycles                           #    1.860 GHz                    
+     3,301,709,135      instructions                     #    1.59  insn per cycle         
+       1.114884740 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2591) (512y:   46) (512z: 9609)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
index a8c80db365..687ea21e82 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-06-28_20:59:44
+DATE: 2024-08-08_20:27:32
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 5.978745e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.024403e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.029368e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.974387e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.019107e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.024136e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 4.059596e+00 +- 2.368053e+00 )  GeV^-4
-TOTAL       :     0.471366 sec
+TOTAL       :     0.465731 sec
 INFO: No Floating Point Exceptions have been reported
-     2,050,817,847      cycles                           #    2.957 GHz                    
-     3,042,700,404      instructions                     #    1.48  insn per cycle         
-       0.752287748 seconds time elapsed
+     1,986,250,676      cycles                           #    2.933 GHz                    
+     2,951,574,048      instructions                     #    1.49  insn per cycle         
+       0.733704221 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 8.124223e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.185934e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.188655e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.127905e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.186845e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.189533e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 6.664703e+00 +- 5.072736e+00 )  GeV^-4
-TOTAL       :     1.817666 sec
+TOTAL       :     1.821509 sec
 INFO: No Floating Point Exceptions have been reported
-     6,209,571,440      cycles                           #    3.020 GHz                    
-    12,616,264,854      instructions                     #    2.03  insn per cycle         
-       2.112773344 seconds time elapsed
+     6,099,068,812      cycles                           #    2.975 GHz                    
+    13,255,673,376      instructions                     #    2.17  insn per cycle         
+       2.106639688 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.001801e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.002828e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.002828e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.982878e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.983848e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.983848e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060121e+00 +- 2.367902e+00 )  GeV^-4
-TOTAL       :     8.201054 sec
+TOTAL       :     8.276232 sec
 INFO: No Floating Point Exceptions have been reported
-    24,965,655,332      cycles                           #    3.043 GHz                    
-    79,115,538,738      instructions                     #    3.17  insn per cycle         
-       8.205177482 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3586) (avx2:    0) (512y:    0) (512z:    0)
+    24,992,064,451      cycles                           #    3.019 GHz                    
+    79,108,890,354      instructions                     #    3.17  insn per cycle         
+       8.280274971 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3573) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.122075e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.140371e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.140371e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.180915e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.194829e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.194829e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060119e+00 +- 2.367901e+00 )  GeV^-4
-TOTAL       :     2.312173 sec
+TOTAL       :     2.288781 sec
 INFO: No Floating Point Exceptions have been reported
-     6,529,449,643      cycles                           #    2.822 GHz                    
-    20,279,014,819      instructions                     #    3.11  insn per cycle         
-       2.317005613 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:13797) (avx2:    0) (512y:    0) (512z:    0)
+     6,519,434,997      cycles                           #    2.844 GHz                    
+    20,271,064,648      instructions                     #    3.11  insn per cycle         
+       2.292801258 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:13785) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.671295e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.678689e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.678689e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.639199e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.645912e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.645912e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060560e+00 +- 2.367611e+00 )  GeV^-4
-TOTAL       :     0.989320 sec
+TOTAL       :     1.005317 sec
 INFO: No Floating Point Exceptions have been reported
-     2,833,882,819      cycles                           #    2.855 GHz                    
-     7,072,820,246      instructions                     #    2.50  insn per cycle         
-       0.993299246 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:12076) (512y:    0) (512z:    0)
+     2,861,574,039      cycles                           #    2.837 GHz                    
+     7,065,482,922      instructions                     #    2.47  insn per cycle         
+       1.009367222 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:12058) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.845765e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.854604e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.854604e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.841221e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.849583e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.849583e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060560e+00 +- 2.367611e+00 )  GeV^-4
-TOTAL       :     0.896647 sec
+TOTAL       :     0.895518 sec
 INFO: No Floating Point Exceptions have been reported
-     2,532,364,843      cycles                           #    2.814 GHz                    
-     6,410,761,496      instructions                     #    2.53  insn per cycle         
-       0.900735690 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11050) (512y:   43) (512z:    0)
+     2,517,844,676      cycles                           #    2.802 GHz                    
+     6,403,839,691      instructions                     #    2.54  insn per cycle         
+       0.899537508 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11026) (512y:   43) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.479281e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.485002e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.485002e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.455203e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.460404e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.460404e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060562e+00 +- 2.367612e+00 )  GeV^-4
-TOTAL       :     1.116768 sec
+TOTAL       :     1.132212 sec
 INFO: No Floating Point Exceptions have been reported
-     2,077,875,816      cycles                           #    1.855 GHz                    
-     3,310,773,706      instructions                     #    1.59  insn per cycle         
-       1.120857530 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2615) (512y:   46) (512z: 9609)
+     2,067,552,649      cycles                           #    1.821 GHz                    
+     3,303,460,015      instructions                     #    1.60  insn per cycle         
+       1.136266053 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2591) (512y:   46) (512z: 9609)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
index df70aedb33..5238dd29f1 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-06-28_20:57:05
+DATE: 2024-08-08_20:24:48
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,15 +50,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 5.434560e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.027161e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.032207e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.461156e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.032316e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.037418e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 4.048178e+00 +- 2.364571e+00 )  GeV^-4
-TOTAL       :     0.472410 sec
+TOTAL       :     0.471716 sec
 INFO: No Floating Point Exceptions have been reported
-     2,024,542,487      cycles                           #    2.941 GHz                    
-     2,972,484,593      instructions                     #    1.47  insn per cycle         
-       0.746922396 seconds time elapsed
+     2,015,572,444      cycles                           #    2.959 GHz                    
+     3,048,101,818      instructions                     #    1.51  insn per cycle         
+       0.739787706 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -70,18 +70,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 7.188309e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.266321e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.269256e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.217590e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.274346e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.276990e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 6.641709e+00 +- 4.994248e+00 )  GeV^-4
-TOTAL       :     1.895330 sec
+TOTAL       :     1.888870 sec
 INFO: No Floating Point Exceptions have been reported
-     6,131,858,588      cycles                           #    2.889 GHz                    
-    13,047,146,236      instructions                     #    2.13  insn per cycle         
-       2.180256479 seconds time elapsed
+     6,296,963,935      cycles                           #    2.979 GHz                    
+    13,479,190,689      instructions                     #    2.14  insn per cycle         
+       2.172551421 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -98,20 +100,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.010951e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.011958e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.011958e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.967176e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.968130e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.968130e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060121e+00 +- 2.367902e+00 )  GeV^-4
-TOTAL       :     8.164059 sec
+TOTAL       :     8.342097 sec
 INFO: No Floating Point Exceptions have been reported
-    24,967,412,253      cycles                           #    3.057 GHz                    
-    79,119,765,586      instructions                     #    3.17  insn per cycle         
-       8.168055309 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3586) (avx2:    0) (512y:    0) (512z:    0)
+    24,950,965,102      cycles                           #    2.990 GHz                    
+    79,109,236,780      instructions                     #    3.17  insn per cycle         
+       8.346055445 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3573) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -126,20 +129,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.311869e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.325581e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.325581e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.089881e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.103174e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.103174e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060119e+00 +- 2.367901e+00 )  GeV^-4
-TOTAL       :     2.250985 sec
+TOTAL       :     2.317816 sec
 INFO: No Floating Point Exceptions have been reported
-     6,523,933,882      cycles                           #    2.894 GHz                    
-    20,278,721,313      instructions                     #    3.11  insn per cycle         
-       2.255081666 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:13797) (avx2:    0) (512y:    0) (512z:    0)
+     6,512,194,963      cycles                           #    2.805 GHz                    
+    20,270,944,427      instructions                     #    3.11  insn per cycle         
+       2.322212487 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:13785) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -154,20 +158,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.665544e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.672629e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.672629e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.538805e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.544913e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.544913e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060560e+00 +- 2.367611e+00 )  GeV^-4
-TOTAL       :     0.992754 sec
+TOTAL       :     1.070841 sec
 INFO: No Floating Point Exceptions have been reported
-     2,835,266,942      cycles                           #    2.846 GHz                    
-     7,072,901,733      instructions                     #    2.49  insn per cycle         
-       0.996781737 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:12076) (512y:    0) (512z:    0)
+     2,864,836,878      cycles                           #    2.667 GHz                    
+     7,066,173,206      instructions                     #    2.47  insn per cycle         
+       1.075040197 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:12058) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -182,20 +187,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.852270e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.861343e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.861343e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.841038e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.849527e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.849527e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060560e+00 +- 2.367611e+00 )  GeV^-4
-TOTAL       :     0.893815 sec
+TOTAL       :     0.895722 sec
 INFO: No Floating Point Exceptions have been reported
-     2,533,769,116      cycles                           #    2.823 GHz                    
-     6,410,639,055      instructions                     #    2.53  insn per cycle         
-       0.898045028 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11050) (512y:   43) (512z:    0)
+     2,515,535,185      cycles                           #    2.798 GHz                    
+     6,403,562,449      instructions                     #    2.55  insn per cycle         
+       0.899557326 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11026) (512y:   43) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -210,20 +216,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.489228e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.494808e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.494808e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.475627e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.481124e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.481124e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060562e+00 +- 2.367612e+00 )  GeV^-4
-TOTAL       :     1.109565 sec
+TOTAL       :     1.116628 sec
 INFO: No Floating Point Exceptions have been reported
-     2,077,805,915      cycles                           #    1.867 GHz                    
-     3,310,655,645      instructions                     #    1.59  insn per cycle         
-       1.113619813 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2615) (512y:   46) (512z: 9609)
+     2,068,334,570      cycles                           #    1.847 GHz                    
+     3,303,479,670      instructions                     #    1.60  insn per cycle         
+       1.120666931 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2591) (512y:   46) (512z: 9609)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
index d204cfd6c4..498b2cd37c 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-06-28_20:23:43
+DATE: 2024-08-08_19:56:56
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 5.984175e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.025153e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.029869e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.966632e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.010698e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.016169e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 4.059596e+00 +- 2.368053e+00 )  GeV^-4
-TOTAL       :     0.487139 sec
+TOTAL       :     0.489605 sec
 INFO: No Floating Point Exceptions have been reported
-     2,063,500,473      cycles                           #    2.945 GHz                    
-     3,107,238,252      instructions                     #    1.51  insn per cycle         
-       0.908009925 seconds time elapsed
+     2,010,594,089      cycles                           #    2.844 GHz                    
+     3,012,973,454      instructions                     #    1.50  insn per cycle         
+       0.767009476 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 8.200654e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.259252e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.261790e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.185325e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.243689e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.246525e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 6.664703e+00 +- 5.072736e+00 )  GeV^-4
-TOTAL       :     1.784294 sec
+TOTAL       :     1.784742 sec
 INFO: No Floating Point Exceptions have been reported
-     6,058,623,406      cycles                           #    3.012 GHz                    
-    12,200,781,654      instructions                     #    2.01  insn per cycle         
-       2.070363274 seconds time elapsed
+     6,010,360,971      cycles                           #    2.981 GHz                    
+    12,082,269,886      instructions                     #    2.01  insn per cycle         
+       2.072759359 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.933157e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.934098e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.934098e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.982152e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.983118e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.983118e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060121e+00 +- 2.367902e+00 )  GeV^-4
-TOTAL       :     8.493120 sec
+TOTAL       :     8.279488 sec
 INFO: No Floating Point Exceptions have been reported
-    24,883,798,967      cycles                           #    2.929 GHz                    
-    78,851,332,915      instructions                     #    3.17  insn per cycle         
-       8.499380928 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3106) (avx2:    0) (512y:    0) (512z:    0)
+    24,906,847,273      cycles                           #    3.008 GHz                    
+    78,843,477,297      instructions                     #    3.17  insn per cycle         
+       8.283438125 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3093) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.076928e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.090345e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.090345e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.430488e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.444488e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.444488e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060119e+00 +- 2.367901e+00 )  GeV^-4
-TOTAL       :     2.325308 sec
+TOTAL       :     2.211830 sec
 INFO: No Floating Point Exceptions have been reported
-     6,469,756,685      cycles                           #    2.778 GHz                    
-    20,237,393,653      instructions                     #    3.13  insn per cycle         
-       2.332670420 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:13509) (avx2:    0) (512y:    0) (512z:    0)
+     6,461,373,436      cycles                           #    2.917 GHz                    
+    20,229,460,939      instructions                     #    3.13  insn per cycle         
+       2.215383125 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:13497) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.522202e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.528155e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.528155e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.546141e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.552346e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.552346e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060560e+00 +- 2.367611e+00 )  GeV^-4
-TOTAL       :     1.086389 sec
+TOTAL       :     1.065436 sec
 INFO: No Floating Point Exceptions have been reported
-     2,987,340,396      cycles                           #    2.741 GHz                    
-     7,214,189,758      instructions                     #    2.41  insn per cycle         
-       1.094660294 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:12458) (512y:    0) (512z:    0)
+     2,970,223,700      cycles                           #    2.780 GHz                    
+     7,206,483,333      instructions                     #    2.43  insn per cycle         
+       1.069132793 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:12440) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.732912e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.740937e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.740937e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.798890e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.807066e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.807066e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060560e+00 +- 2.367611e+00 )  GeV^-4
-TOTAL       :     0.954309 sec
+TOTAL       :     0.916539 sec
 INFO: No Floating Point Exceptions have been reported
-     2,617,388,524      cycles                           #    2.732 GHz                    
-     6,551,995,701      instructions                     #    2.50  insn per cycle         
-       0.961130408 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11478) (512y:   26) (512z:    0)
+     2,599,305,235      cycles                           #    2.826 GHz                    
+     6,544,414,590      instructions                     #    2.52  insn per cycle         
+       0.920171410 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11454) (512y:   26) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.345824e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.350418e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.350418e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.428262e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.433365e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.433365e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060562e+00 +- 2.367612e+00 )  GeV^-4
-TOTAL       :     1.226707 sec
+TOTAL       :     1.153100 sec
 INFO: No Floating Point Exceptions have been reported
-     2,155,235,101      cycles                           #    1.751 GHz                    
-     3,469,153,346      instructions                     #    1.61  insn per cycle         
-       1.233547264 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3051) (512y:   25) (512z: 9681)
+     2,140,036,710      cycles                           #    1.851 GHz                    
+     3,461,118,107      instructions                     #    1.62  insn per cycle         
+       1.156674320 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3027) (512y:   25) (512z: 9681)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
index 4308c5ba18..dc9ca7a530 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-06-28_20:45:14
+DATE: 2024-08-08_20:12:32
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 6.048358e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.095472e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.100450e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.067673e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.110658e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.115133e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 4.059597e+00 +- 2.368053e+00 )  GeV^-4
-TOTAL       :     0.488485 sec
+TOTAL       :     0.487879 sec
 INFO: No Floating Point Exceptions have been reported
-     2,077,018,410      cycles                           #    2.955 GHz                    
-     3,109,798,046      instructions                     #    1.50  insn per cycle         
-       0.764182373 seconds time elapsed
+     2,053,159,539      cycles                           #    2.919 GHz                    
+     3,075,135,999      instructions                     #    1.50  insn per cycle         
+       0.764389501 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 8.654005e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.729790e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.733030e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.681005e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.744501e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.747278e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 6.664703e+00 +- 5.072736e+00 )  GeV^-4
-TOTAL       :     1.728242 sec
+TOTAL       :     1.731074 sec
 INFO: No Floating Point Exceptions have been reported
-     5,873,252,538      cycles                           #    2.998 GHz                    
-    11,848,783,670      instructions                     #    2.02  insn per cycle         
-       2.017968081 seconds time elapsed
+     5,778,197,761      cycles                           #    2.951 GHz                    
+    12,437,674,784      instructions                     #    2.15  insn per cycle         
+       2.017655879 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,26 +97,27 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.634141e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.634934e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.634934e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.722501e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.723307e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.723307e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 4.059969e+00 +- 2.367799e+00 )  GeV^-4
-TOTAL       :    29.116821 sec
+TOTAL       :    28.664558 sec
 INFO: No Floating Point Exceptions have been reported
-    88,296,563,936      cycles                           #    3.033 GHz                    
-   135,724,094,332      instructions                     #    1.54  insn per cycle         
-      29.120991306 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:15654) (avx2:    0) (512y:    0) (512z:    0)
+    85,759,268,786      cycles                           #    2.992 GHz                    
+   135,287,125,941      instructions                     #    1.58  insn per cycle         
+      28.668460894 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:15198) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++)    = 6.627535e-04
-Avg ME (F77/C++)    = 6.6275351083142087E-004
-Relative difference = 1.6343060926412837e-08
+Avg ME (F77/C++)    = 6.6275351218394313E-004
+Relative difference = 1.8383823081355348e-08
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.056966e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.069695e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.069695e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.988288e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.001222e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.001222e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.059962e+00 +- 2.367792e+00 )  GeV^-4
-TOTAL       :     2.332176 sec
+TOTAL       :     2.351494 sec
 INFO: No Floating Point Exceptions have been reported
-     6,779,570,560      cycles                           #    2.903 GHz                    
-    19,363,467,868      instructions                     #    2.86  insn per cycle         
-       2.336362830 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:69602) (avx2:    0) (512y:    0) (512z:    0)
+     6,754,834,567      cycles                           #    2.869 GHz                    
+    19,356,472,261      instructions                     #    2.87  insn per cycle         
+       2.355469886 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:69590) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.493626e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.499294e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.499294e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.466081e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.471571e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.471571e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060903e+00 +- 2.367377e+00 )  GeV^-4
-TOTAL       :     1.106963 sec
+TOTAL       :     1.123603 sec
 INFO: No Floating Point Exceptions have been reported
-     3,165,795,966      cycles                           #    2.853 GHz                    
-     6,799,095,089      instructions                     #    2.15  insn per cycle         
-       1.111060477 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:49016) (512y:    0) (512z:    0)
+     3,163,501,117      cycles                           #    2.807 GHz                    
+     6,791,828,071      instructions                     #    2.15  insn per cycle         
+       1.127610138 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:48998) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.793077e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.801261e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.801261e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.760032e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.767850e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.767850e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060903e+00 +- 2.367377e+00 )  GeV^-4
-TOTAL       :     0.922630 sec
+TOTAL       :     0.936650 sec
 INFO: No Floating Point Exceptions have been reported
-     2,642,708,250      cycles                           #    2.854 GHz                    
-     5,977,492,021      instructions                     #    2.26  insn per cycle         
-       0.926708959 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:42613) (512y:   11) (512z:    0)
+     2,623,882,438      cycles                           #    2.794 GHz                    
+     5,969,895,302      instructions                     #    2.28  insn per cycle         
+       0.940643059 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:42589) (512y:   11) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.420261e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.425283e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.425283e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.479077e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.484827e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.484827e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060905e+00 +- 2.367377e+00 )  GeV^-4
-TOTAL       :     1.163603 sec
+TOTAL       :     1.113882 sec
 INFO: No Floating Point Exceptions have been reported
-     2,081,277,351      cycles                           #    1.784 GHz                    
-     3,501,680,018      instructions                     #    1.68  insn per cycle         
-       1.167704285 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 5210) (512y:    3) (512z:44834)
+     2,068,747,571      cycles                           #    1.851 GHz                    
+     3,493,400,176      instructions                     #    1.69  insn per cycle         
+       1.117954016 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 5186) (512y:    3) (512z:44834)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
index bdfa627e42..df0f71d174 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-06-28_20:46:02
+DATE: 2024-08-08_20:13:21
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 6.099021e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.152119e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.157226e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.128808e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.173626e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.178585e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 4.059597e+00 +- 2.368053e+00 )  GeV^-4
-TOTAL       :     0.484270 sec
+TOTAL       :     0.487050 sec
 INFO: No Floating Point Exceptions have been reported
-     2,083,532,002      cycles                           #    2.960 GHz                    
-     3,101,650,257      instructions                     #    1.49  insn per cycle         
-       0.760655028 seconds time elapsed
+     2,067,516,202      cycles                           #    2.920 GHz                    
+     3,084,461,624      instructions                     #    1.49  insn per cycle         
+       0.767079444 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 8.768383e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.846177e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.849534e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.729947e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.794330e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.797099e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 6.664703e+00 +- 5.072736e+00 )  GeV^-4
-TOTAL       :     1.709256 sec
+TOTAL       :     1.715330 sec
 INFO: No Floating Point Exceptions have been reported
-     5,845,346,990      cycles                           #    3.011 GHz                    
-    12,438,318,638      instructions                     #    2.13  insn per cycle         
-       1.998774694 seconds time elapsed
+     5,790,416,249      cycles                           #    2.963 GHz                    
+    12,405,778,334      instructions                     #    2.14  insn per cycle         
+       2.012725573 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2
@@ -95,26 +97,27 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.712703e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.713504e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.713504e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.739276e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.740108e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.740108e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 4.059969e+00 +- 2.367799e+00 )  GeV^-4
-TOTAL       :    28.715532 sec
+TOTAL       :    28.579010 sec
 INFO: No Floating Point Exceptions have been reported
-    86,154,710,827      cycles                           #    3.006 GHz                    
-   135,583,041,000      instructions                     #    1.57  insn per cycle         
-      28.719661195 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:15696) (avx2:    0) (512y:    0) (512z:    0)
+    85,869,035,147      cycles                           #    3.005 GHz                    
+   135,713,098,525      instructions                     #    1.58  insn per cycle         
+      28.582934987 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:15490) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++)    = 6.627535e-04
-Avg ME (F77/C++)    = 6.6275346699767868E-004
-Relative difference = 4.979577076821206e-08
+Avg ME (F77/C++)    = 6.6275349723624727E-004
+Relative difference = 4.170106635889315e-09
 OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.004558e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.017373e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.017373e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.656997e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.668108e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.668108e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.059962e+00 +- 2.367792e+00 )  GeV^-4
-TOTAL       :     2.349549 sec
+TOTAL       :     2.468183 sec
 INFO: No Floating Point Exceptions have been reported
-     6,843,302,569      cycles                           #    2.909 GHz                    
-    19,413,787,446      instructions                     #    2.84  insn per cycle         
-       2.353799328 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:69633) (avx2:    0) (512y:    0) (512z:    0)
+     6,838,146,467      cycles                           #    2.767 GHz                    
+    19,407,163,330      instructions                     #    2.84  insn per cycle         
+       2.472172726 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:69621) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.499299e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.504925e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.504925e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.494743e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.500456e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.500456e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060903e+00 +- 2.367377e+00 )  GeV^-4
-TOTAL       :     1.101876 sec
+TOTAL       :     1.101868 sec
 INFO: No Floating Point Exceptions have been reported
-     3,116,540,240      cycles                           #    2.820 GHz                    
-     6,722,785,902      instructions                     #    2.16  insn per cycle         
-       1.105922589 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:47703) (512y:    0) (512z:    0)
+     3,102,166,074      cycles                           #    2.807 GHz                    
+     6,715,779,639      instructions                     #    2.16  insn per cycle         
+       1.105919768 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:47685) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.797776e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.805926e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.805926e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.757205e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.764907e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.764907e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060903e+00 +- 2.367377e+00 )  GeV^-4
-TOTAL       :     0.920074 sec
+TOTAL       :     0.937783 sec
 INFO: No Floating Point Exceptions have been reported
-     2,632,426,467      cycles                           #    2.851 GHz                    
-     5,975,861,662      instructions                     #    2.27  insn per cycle         
-       0.924164164 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:41894) (512y:   13) (512z:    0)
+     2,624,045,983      cycles                           #    2.788 GHz                    
+     5,968,641,196      instructions                     #    2.27  insn per cycle         
+       0.941620580 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:41870) (512y:   13) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.503457e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.509298e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.509298e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.475717e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.481089e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.481089e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 4.060905e+00 +- 2.367377e+00 )  GeV^-4
-TOTAL       :     1.098991 sec
+TOTAL       :     1.116160 sec
 INFO: No Floating Point Exceptions have been reported
-     2,074,398,072      cycles                           #    1.881 GHz                    
-     3,493,830,287      instructions                     #    1.68  insn per cycle         
-       1.103135228 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4174) (512y:    4) (512z:44485)
+     2,072,491,943      cycles                           #    1.851 GHz                    
+     3,486,963,775      instructions                     #    1.68  insn per cycle         
+       1.120311238 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4150) (512y:    4) (512z:44485)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
index 37791bd44c..f906b484d1 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-06-28_20:24:08
+DATE: 2024-08-08_19:57:21
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.466437e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.490895e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.492995e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.456351e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.482973e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.485002e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     0.528024 sec
+TOTAL       :     0.527206 sec
 INFO: No Floating Point Exceptions have been reported
-     2,167,248,013      cycles                           #    2.842 GHz                    
-     3,408,911,296      instructions                     #    1.57  insn per cycle         
-       1.059002803 seconds time elapsed
+     2,263,706,765      cycles                           #    2.945 GHz                    
+     3,529,595,149      instructions                     #    1.56  insn per cycle         
+       0.828954022 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.140196e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.168579e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.169818e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.128784e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.158212e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.159533e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 6.665112e+00 +- 5.002651e+00 )  GeV^-4
-TOTAL       :     3.031191 sec
+TOTAL       :     3.057239 sec
 INFO: No Floating Point Exceptions have been reported
-     9,553,855,695      cycles                           #    2.907 GHz                    
-    20,366,578,683      instructions                     #    2.13  insn per cycle         
-       3.341274220 seconds time elapsed
+     9,783,417,122      cycles                           #    2.925 GHz                    
+    13,211,264,053      instructions                     #    1.35  insn per cycle         
+       3.405402734 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.843122e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.844017e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.844017e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.903780e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.904695e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.904695e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     8.908083 sec
+TOTAL       :     8.621676 sec
 INFO: No Floating Point Exceptions have been reported
-    25,993,175,975      cycles                           #    2.917 GHz                    
-    79,438,748,307      instructions                     #    3.06  insn per cycle         
-       8.914670521 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 4789) (avx2:    0) (512y:    0) (512z:    0)
+    25,964,721,381      cycles                           #    3.010 GHz                    
+    79,427,591,787      instructions                     #    3.06  insn per cycle         
+       8.626023484 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 4776) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.477942e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.481153e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.481153e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.603827e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.607327e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.607327e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     4.725780 sec
+TOTAL       :     4.557849 sec
 INFO: No Floating Point Exceptions have been reported
-    12,846,841,244      cycles                           #    2.717 GHz                    
-    38,833,705,303      instructions                     #    3.02  insn per cycle         
-       4.732362911 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2:    0) (512y:    0) (512z:    0)
+    12,814,190,735      cycles                           #    2.810 GHz                    
+    38,825,158,190      instructions                     #    3.03  insn per cycle         
+       4.561789335 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:13172) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.084067e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.100651e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.100651e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.224833e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.241665e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.241665e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     2.038475 sec
+TOTAL       :     2.000761 sec
 INFO: No Floating Point Exceptions have been reported
-     5,582,099,770      cycles                           #    2.734 GHz                    
-    13,625,182,795      instructions                     #    2.44  insn per cycle         
-       2.046074509 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11433) (512y:    0) (512z:    0)
+     5,588,116,210      cycles                           #    2.789 GHz                    
+    13,618,090,861      instructions                     #    2.44  insn per cycle         
+       2.004606328 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11415) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.157339e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.179034e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.179034e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.076409e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.097653e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.097653e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     1.800984 sec
+TOTAL       :     1.813694 sec
 INFO: No Floating Point Exceptions have been reported
-     4,872,138,012      cycles                           #    2.701 GHz                    
-    12,304,394,482      instructions                     #    2.53  insn per cycle         
-       1.808284460 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:10343) (512y:   79) (512z:    0)
+     4,900,228,417      cycles                           #    2.697 GHz                    
+    12,298,153,916      instructions                     #    2.51  insn per cycle         
+       1.817598978 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:10319) (512y:   79) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 6.942926e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.955101e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.955101e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.275673e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.288563e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.288563e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     2.372771 sec
+TOTAL       :     2.261390 sec
 INFO: No Floating Point Exceptions have been reported
-     4,182,201,238      cycles                           #    1.761 GHz                    
-     6,398,643,943      instructions                     #    1.53  insn per cycle         
-       2.378723749 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1981) (512y:   93) (512z: 9359)
+     4,176,196,803      cycles                           #    1.844 GHz                    
+     6,391,790,037      instructions                     #    1.53  insn per cycle         
+       2.265279894 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1957) (512y:   93) (512z: 9359)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt
index ef7bf47569..965f537970 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-06-28_20:24:42
+DATE: 2024-08-08_19:57:54
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.471540e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.495762e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.497819e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.478905e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.505299e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.507625e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     0.530504 sec
+TOTAL       :     0.523820 sec
 INFO: No Floating Point Exceptions have been reported
-     2,208,819,005      cycles                           #    2.919 GHz                    
-     3,459,394,113      instructions                     #    1.57  insn per cycle         
-       0.990710147 seconds time elapsed
+     2,217,657,303      cycles                           #    2.936 GHz                    
+     3,422,937,672      instructions                     #    1.54  insn per cycle         
+       0.814906080 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.128166e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.156082e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.157268e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.142523e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.171945e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.173230e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 6.665112e+00 +- 5.002651e+00 )  GeV^-4
-TOTAL       :     3.021368 sec
+TOTAL       :     3.034284 sec
 INFO: No Floating Point Exceptions have been reported
-     9,847,196,311      cycles                           #    3.007 GHz                    
-    20,896,495,636      instructions                     #    2.12  insn per cycle         
-       3.330773882 seconds time elapsed
+     9,867,106,252      cycles                           #    2.970 GHz                    
+    19,377,940,372      instructions                     #    1.96  insn per cycle         
+       3.381320729 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.904896e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.905828e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.905828e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.898812e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.899704e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.899704e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     8.618971 sec
+TOTAL       :     8.643841 sec
 INFO: No Floating Point Exceptions have been reported
-    26,017,566,799      cycles                           #    3.018 GHz                    
-    79,463,058,396      instructions                     #    3.05  insn per cycle         
-       8.625612789 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 4445) (avx2:    0) (512y:    0) (512z:    0)
+    26,013,311,554      cycles                           #    3.009 GHz                    
+    79,457,517,298      instructions                     #    3.05  insn per cycle         
+       8.647992970 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 4432) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.466998e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.470121e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.470121e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.611561e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.614888e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.614888e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     4.740884 sec
+TOTAL       :     4.547888 sec
 INFO: No Floating Point Exceptions have been reported
-    12,818,499,465      cycles                           #    2.706 GHz                    
-    38,787,565,320      instructions                     #    3.03  insn per cycle         
-       4.748209539 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:12946) (avx2:    0) (512y:    0) (512z:    0)
+    12,837,773,076      cycles                           #    2.821 GHz                    
+    38,782,082,140      instructions                     #    3.02  insn per cycle         
+       4.551612597 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:12934) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.954476e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.970429e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.970429e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.352238e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.369622e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.369622e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     2.071432 sec
+TOTAL       :     1.970486 sec
 INFO: No Floating Point Exceptions have been reported
-     5,606,087,504      cycles                           #    2.701 GHz                    
-    13,739,354,094      instructions                     #    2.45  insn per cycle         
-       2.079581267 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11516) (512y:    0) (512z:    0)
+     5,585,325,981      cycles                           #    2.830 GHz                    
+    13,732,293,539      instructions                     #    2.46  insn per cycle         
+       1.974370273 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:11498) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.052340e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.072919e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.072919e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.400061e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.421825e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.421825e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     1.820988 sec
+TOTAL       :     1.751328 sec
 INFO: No Floating Point Exceptions have been reported
-     4,962,658,899      cycles                           #    2.719 GHz                    
-    12,428,563,963      instructions                     #    2.50  insn per cycle         
-       1.828126764 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:10334) (512y:  239) (512z:    0)
+     4,952,817,402      cycles                           #    2.822 GHz                    
+    12,422,492,733      instructions                     #    2.51  insn per cycle         
+       1.755554143 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:10310) (512y:  239) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 6.784333e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.797005e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.797005e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.219259e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.232248e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.232248e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 4.063123e+00 +- 2.368970e+00 )  GeV^-4
-TOTAL       :     2.427429 sec
+TOTAL       :     2.278823 sec
 INFO: No Floating Point Exceptions have been reported
-     4,189,591,829      cycles                           #    1.722 GHz                    
-     6,503,095,126      instructions                     #    1.55  insn per cycle         
-       2.438290310 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1804) (512y:  191) (512z: 9368)
+     4,182,901,935      cycles                           #    1.833 GHz                    
+     6,495,418,480      instructions                     #    1.55  insn per cycle         
+       2.282695112 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1780) (512y:  191) (512z: 9368)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
index 881adcbbb3..69ee294d0a 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 
-DATE: 2024-06-28_20:26:32
+DATE: 2024-08-08_19:59:44
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.064817e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.065212e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.065427e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.065566e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.065949e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.066073e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     2.429437 sec
+TOTAL       :     2.441334 sec
 INFO: No Floating Point Exceptions have been reported
-     8,296,328,617      cycles                           #    3.016 GHz                    
-    17,339,087,580      instructions                     #    2.09  insn per cycle         
-       2.815159518 seconds time elapsed
+     8,270,107,004      cycles                           #    2.987 GHz                    
+    17,474,421,900      instructions                     #    2.11  insn per cycle         
+       2.824451613 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 9.222997e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.224884e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.225172e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.242290e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.244758e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.245006e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 1.856249e-04 +- 8.329951e-05 )  GeV^-6
-TOTAL       :     4.002446 sec
+TOTAL       :     4.011109 sec
 INFO: No Floating Point Exceptions have been reported
-    13,014,244,174      cycles                           #    3.008 GHz                    
-    30,562,795,013      instructions                     #    2.35  insn per cycle         
-       4.387764634 seconds time elapsed
+    12,991,708,385      cycles                           #    2.995 GHz                    
+    30,957,069,887      instructions                     #    2.38  insn per cycle         
+       4.393935391 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.942411e+01                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.942623e+01                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.942623e+01                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.391032e+01                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.391286e+01                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.391286e+01                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     6.659863 sec
+TOTAL       :     6.292298 sec
 INFO: No Floating Point Exceptions have been reported
-    18,952,576,848      cycles                           #    2.848 GHz                    
-    53,913,437,622      instructions                     #    2.84  insn per cycle         
-       6.666065835 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:32438) (avx2:    0) (512y:    0) (512z:    0)
+    18,909,993,943      cycles                           #    3.004 GHz                    
+    53,904,007,557      instructions                     #    2.85  insn per cycle         
+       6.296177339 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:32425) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.646263e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.646555e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.646555e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.592148e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.592238e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.592238e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     3.258166 sec
+TOTAL       :     3.319128 sec
 INFO: No Floating Point Exceptions have been reported
-     9,815,076,926      cycles                           #    3.034 GHz                    
-    27,160,713,930      instructions                     #    2.77  insn per cycle         
-       3.265054274 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:96511) (avx2:    0) (512y:    0) (512z:    0)
+     9,961,985,828      cycles                           #    2.999 GHz                    
+    27,151,879,178      instructions                     #    2.73  insn per cycle         
+       3.323113942 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:96499) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.513827e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.514244e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.514244e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.420642e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.421042e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.421042e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     1.513778 sec
+TOTAL       :     1.544804 sec
 INFO: No Floating Point Exceptions have been reported
-     4,295,484,479      cycles                           #    2.840 GHz                    
-     9,598,007,558      instructions                     #    2.23  insn per cycle         
-       1.520024006 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:84989) (512y:    0) (512z:    0)
+     4,330,644,690      cycles                           #    2.797 GHz                    
+     9,589,874,862      instructions                     #    2.21  insn per cycle         
+       1.548809848 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:84971) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.021787e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.022321e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.022321e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.965040e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.965659e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.965659e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     1.323232 sec
+TOTAL       :     1.333170 sec
 INFO: No Floating Point Exceptions have been reported
-     3,750,332,594      cycles                           #    2.837 GHz                    
-     8,522,076,942      instructions                     #    2.27  insn per cycle         
-       1.329385160 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:80643) (512y:   89) (512z:    0)
+     3,730,547,974      cycles                           #    2.792 GHz                    
+     8,513,850,652      instructions                     #    2.28  insn per cycle         
+       1.336769828 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:80619) (512y:   89) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.657338e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.657868e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.657868e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.618586e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.619123e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.619123e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     1.450577 sec
+TOTAL       :     1.462675 sec
 INFO: No Floating Point Exceptions have been reported
-     2,701,855,108      cycles                           #    1.860 GHz                    
-     4,288,620,965      instructions                     #    1.59  insn per cycle         
-       1.454564839 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2876) (512y:  103) (512z:79119)
+     2,695,334,241      cycles                           #    1.839 GHz                    
+     4,280,276,658      instructions                     #    1.59  insn per cycle         
+       1.466339679 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2852) (512y:  103) (512z:79119)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
index 1c1653b55c..e1baa342f4 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 
-DATE: 2024-06-28_20:53:17
+DATE: 2024-08-08_20:20:33
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.071657e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.072586e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.072586e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.064923e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.065845e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.065845e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     2.389807 sec
+TOTAL       :     2.386081 sec
 INFO: No Floating Point Exceptions have been reported
-     8,166,677,370      cycles                           #    3.014 GHz                    
-    17,110,502,600      instructions                     #    2.10  insn per cycle         
-       2.768730521 seconds time elapsed
+     8,068,364,516      cycles                           #    2.980 GHz                    
+    18,499,320,498      instructions                     #    2.29  insn per cycle         
+       2.766222042 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -79,18 +79,20 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 9.239728e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.274140e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.274140e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.216459e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.248148e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.248148e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 1.856249e-04 +- 8.329951e-05 )  GeV^-6
-TOTAL       :     3.989987 sec
+TOTAL       :     3.985205 sec
 INFO: No Floating Point Exceptions have been reported
-    12,984,716,355      cycles                           #    3.010 GHz                    
-    30,026,547,711      instructions                     #    2.31  insn per cycle         
-       4.369885993 seconds time elapsed
+    12,879,401,549      cycles                           #    2.982 GHz                    
+    28,276,545,925      instructions                     #    2.20  insn per cycle         
+       4.377652629 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -108,20 +110,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.075924e+01                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.076143e+01                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.076143e+01                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.400950e+01                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.401188e+01                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.401188e+01                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     6.545095 sec
+TOTAL       :     6.287943 sec
 INFO: No Floating Point Exceptions have been reported
-    18,993,218,804      cycles                           #    2.901 GHz                    
-    53,909,508,309      instructions                     #    2.84  insn per cycle         
-       6.549040643 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:32438) (avx2:    0) (512y:    0) (512z:    0)
+    18,917,133,316      cycles                           #    3.007 GHz                    
+    53,900,822,413      instructions                     #    2.85  insn per cycle         
+       6.291810989 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:32425) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -137,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.634824e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.634915e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.634915e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.588454e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.588541e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.588541e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     3.239041 sec
+TOTAL       :     3.326167 sec
 INFO: No Floating Point Exceptions have been reported
-     9,844,142,959      cycles                           #    3.038 GHz                    
-    27,159,907,056      instructions                     #    2.76  insn per cycle         
-       3.243158367 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:96511) (avx2:    0) (512y:    0) (512z:    0)
+     9,981,726,497      cycles                           #    2.998 GHz                    
+    27,151,411,979      instructions                     #    2.72  insn per cycle         
+       3.330120405 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:96499) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -166,20 +170,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.522862e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.523280e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.523280e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.463521e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.463922e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.463922e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     1.509038 sec
+TOTAL       :     1.526941 sec
 INFO: No Floating Point Exceptions have been reported
-     4,285,489,952      cycles                           #    2.833 GHz                    
-     9,598,013,083      instructions                     #    2.24  insn per cycle         
-       1.513170310 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:84989) (512y:    0) (512z:    0)
+     4,301,902,923      cycles                           #    2.811 GHz                    
+     9,590,835,987      instructions                     #    2.23  insn per cycle         
+       1.530966019 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:84971) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -195,20 +200,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.062060e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.062617e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.062617e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.003469e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.004081e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.004081e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     1.305600 sec
+TOTAL       :     1.322167 sec
 INFO: No Floating Point Exceptions have been reported
-     3,731,786,493      cycles                           #    2.852 GHz                    
-     8,522,102,351      instructions                     #    2.28  insn per cycle         
-       1.309705466 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:80643) (512y:   89) (512z:    0)
+     3,729,352,964      cycles                           #    2.814 GHz                    
+     8,515,368,436      instructions                     #    2.28  insn per cycle         
+       1.326036505 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:80619) (512y:   89) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -224,20 +230,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.604372e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.604990e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.604990e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.565416e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.566063e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.566063e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     1.470888 sec
+TOTAL       :     1.483865 sec
 INFO: No Floating Point Exceptions have been reported
-     2,708,113,908      cycles                           #    1.837 GHz                    
-     4,288,944,422      instructions                     #    1.58  insn per cycle         
-       1.474912429 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2876) (512y:  103) (512z:79119)
+     2,695,897,083      cycles                           #    1.813 GHz                    
+     4,281,463,157      instructions                     #    1.59  insn per cycle         
+       1.487939257 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2852) (512y:  103) (512z:79119)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
index 657598560e..618d256396 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 
-DATE: 2024-06-28_20:27:25
+DATE: 2024-08-08_20:00:52
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.060868e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.061271e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.061430e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.058227e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.058613e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.058749e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     2.430160 sec
+TOTAL       :     2.446864 sec
 INFO: No Floating Point Exceptions have been reported
-     8,275,897,121      cycles                           #    3.006 GHz                    
-    18,470,926,867      instructions                     #    2.23  insn per cycle         
-       2.811196040 seconds time elapsed
+     8,303,278,275      cycles                           #    3.000 GHz                    
+    18,645,596,525      instructions                     #    2.25  insn per cycle         
+       2.826809106 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 9.235367e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.237271e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.237519e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.233958e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.236030e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.236303e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 1.856249e-04 +- 8.329951e-05 )  GeV^-6
-TOTAL       :     3.996473 sec
+TOTAL       :     4.007873 sec
 INFO: No Floating Point Exceptions have been reported
-    13,050,093,352      cycles                           #    3.020 GHz                    
-    31,028,947,131      instructions                     #    2.38  insn per cycle         
-       4.377792426 seconds time elapsed
+    12,910,025,920      cycles                           #    2.976 GHz                    
+    30,025,616,729      instructions                     #    2.33  insn per cycle         
+       4.392667162 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.480305e+01                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.480556e+01                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.480556e+01                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.875983e+01                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.876201e+01                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.876201e+01                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     6.241708 sec
+TOTAL       :     6.703762 sec
 INFO: No Floating Point Exceptions have been reported
-    18,821,174,280      cycles                           #    3.014 GHz                    
-    53,941,028,180      instructions                     #    2.87  insn per cycle         
-       6.245850686 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:32036) (avx2:    0) (512y:    0) (512z:    0)
+    18,880,147,773      cycles                           #    2.815 GHz                    
+    53,931,698,860      instructions                     #    2.86  insn per cycle         
+       6.707560831 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:32023) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.609430e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.609514e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.609514e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.621951e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.622050e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.622050e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     3.285465 sec
+TOTAL       :     3.258110 sec
 INFO: No Floating Point Exceptions have been reported
-     9,981,253,621      cycles                           #    3.035 GHz                    
-    27,137,548,724      instructions                     #    2.72  insn per cycle         
-       3.289453787 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:96387) (avx2:    0) (512y:    0) (512z:    0)
+     9,846,977,880      cycles                           #    3.019 GHz                    
+    27,128,812,737      instructions                     #    2.76  insn per cycle         
+       3.262446550 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:96375) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.556423e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.556875e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.556875e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.448151e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.448577e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.448577e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     1.489857 sec
+TOTAL       :     1.533013 sec
 INFO: No Floating Point Exceptions have been reported
-     4,244,835,717      cycles                           #    2.843 GHz                    
-     9,591,311,003      instructions                     #    2.26  insn per cycle         
-       1.494070789 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:84996) (512y:    0) (512z:    0)
+     4,309,903,765      cycles                           #    2.805 GHz                    
+     9,584,249,957      instructions                     #    2.22  insn per cycle         
+       1.537048676 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:84978) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.060696e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.061222e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.061222e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.985777e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.986306e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.986306e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     1.305460 sec
+TOTAL       :     1.327029 sec
 INFO: No Floating Point Exceptions have been reported
-     3,733,750,272      cycles                           #    2.853 GHz                    
-     8,514,034,293      instructions                     #    2.28  insn per cycle         
-       1.309432612 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:80666) (512y:  239) (512z:    0)
+     3,743,360,462      cycles                           #    2.814 GHz                    
+     8,506,735,194      instructions                     #    2.27  insn per cycle         
+       1.330926412 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:80642) (512y:  239) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.642371e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.642906e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.642906e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.581234e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.581805e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.581805e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     1.454904 sec
+TOTAL       :     1.477295 sec
 INFO: No Floating Point Exceptions have been reported
-     2,700,741,707      cycles                           #    1.852 GHz                    
-     4,287,448,214      instructions                     #    1.59  insn per cycle         
-       1.458920670 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2713) (512y:  185) (512z:79103)
+     2,699,035,749      cycles                           #    1.824 GHz                    
+     4,280,090,319      instructions                     #    1.59  insn per cycle         
+       1.480967463 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2689) (512y:  185) (512z:79103)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
index 1ec1f218cc..b4fc180cc1 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 
-DATE: 2024-06-28_20:28:17
+DATE: 2024-08-08_20:02:00
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 6.293372e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.294106e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.294365e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.298150e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.298890e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.299224e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.186984e-05 +- 9.824899e-06 )  GeV^-6
-TOTAL       :     1.734398 sec
+TOTAL       :     1.751662 sec
 INFO: No Floating Point Exceptions have been reported
-     5,998,471,870      cycles                           #    3.013 GHz                    
-    12,941,477,945      instructions                     #    2.16  insn per cycle         
-       2.047491345 seconds time elapsed
+     5,936,795,436      cycles                           #    2.952 GHz                    
+    12,013,270,651      instructions                     #    2.02  insn per cycle         
+       2.067502844 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 2.166641e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.167275e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.167430e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.155180e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.155800e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.155887e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.856829e-04 +- 8.333437e-05 )  GeV^-6
-TOTAL       :     2.041320 sec
+TOTAL       :     2.055202 sec
 INFO: No Floating Point Exceptions have been reported
-     6,882,695,469      cycles                           #    2.993 GHz                    
-    15,335,035,349      instructions                     #    2.23  insn per cycle         
-       2.356075989 seconds time elapsed
+     6,915,039,139      cycles                           #    2.986 GHz                    
+    14,633,712,669      instructions                     #    2.12  insn per cycle         
+       2.372054868 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.760616e+01                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.760920e+01                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.760920e+01                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.752648e+01                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.752917e+01                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.752917e+01                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187013e-05 +- 9.825040e-06 )  GeV^-6
-TOTAL       :     6.032900 sec
+TOTAL       :     6.035465 sec
 INFO: No Floating Point Exceptions have been reported
-    18,362,673,487      cycles                           #    3.042 GHz                    
-    53,916,281,327      instructions                     #    2.94  insn per cycle         
-       6.036929247 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:20155) (avx2:    0) (512y:    0) (512z:    0)
+    18,171,458,820      cycles                           #    3.009 GHz                    
+    53,912,614,149      instructions                     #    2.97  insn per cycle         
+       6.039280806 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:20142) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.307614e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.308000e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.308000e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.468219e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.468626e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.468626e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187013e-05 +- 9.825037e-06 )  GeV^-6
-TOTAL       :     1.602584 sec
+TOTAL       :     1.524160 sec
 INFO: No Floating Point Exceptions have been reported
-     4,635,363,924      cycles                           #    2.886 GHz                    
-    13,813,886,206      instructions                     #    2.98  insn per cycle         
-       1.606784601 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:97034) (avx2:    0) (512y:    0) (512z:    0)
+     4,594,690,732      cycles                           #    3.008 GHz                    
+    13,806,361,271      instructions                     #    3.00  insn per cycle         
+       1.528090955 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:97022) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.014969e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.016646e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.016646e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.022651e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.024377e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.024377e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187188e-05 +- 9.826767e-06 )  GeV^-6
-TOTAL       :     0.757122 sec
+TOTAL       :     0.754295 sec
 INFO: No Floating Point Exceptions have been reported
-     2,168,880,892      cycles                           #    2.852 GHz                    
-     4,843,034,742      instructions                     #    2.23  insn per cycle         
-       0.761157343 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:85515) (512y:    0) (512z:    0)
+     2,137,910,409      cycles                           #    2.822 GHz                    
+     4,835,783,841      instructions                     #    2.26  insn per cycle         
+       0.758250875 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:85497) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.123850e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.126172e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.126172e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.922130e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.924339e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.924339e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187188e-05 +- 9.826767e-06 )  GeV^-6
-TOTAL       :     0.654383 sec
+TOTAL       :     0.668838 sec
 INFO: No Floating Point Exceptions have been reported
-     1,871,543,363      cycles                           #    2.845 GHz                    
-     4,297,216,044      instructions                     #    2.30  insn per cycle         
-       0.658314207 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:81214) (512y:   44) (512z:    0)
+     1,877,666,899      cycles                           #    2.793 GHz                    
+     4,290,021,460      instructions                     #    2.28  insn per cycle         
+       0.672738963 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:81190) (512y:   44) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.212794e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.214860e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.214860e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.249467e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.251538e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.251538e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187188e-05 +- 9.826771e-06 )  GeV^-6
-TOTAL       :     0.737304 sec
+TOTAL       :     0.730439 sec
 INFO: No Floating Point Exceptions have been reported
-     1,364,348,136      cycles                           #    1.843 GHz                    
-     2,168,826,929      instructions                     #    1.59  insn per cycle         
-       0.741185900 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3493) (512y:   47) (512z:79334)
+     1,353,764,576      cycles                           #    1.845 GHz                    
+     2,161,505,151      instructions                     #    1.60  insn per cycle         
+       0.734391470 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3469) (512y:   47) (512z:79334)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt
index 19c3a156fb..2973bcd9f9 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 
-DATE: 2024-06-28_20:54:09
+DATE: 2024-08-08_20:21:41
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 6.304682e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.306407e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.306407e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.303570e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.305124e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.305124e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187093e-05 +- 9.825663e-06 )  GeV^-6
-TOTAL       :     1.682174 sec
+TOTAL       :     1.683838 sec
 INFO: No Floating Point Exceptions have been reported
-     5,852,031,284      cycles                           #    3.020 GHz                    
-    12,722,485,881      instructions                     #    2.17  insn per cycle         
-       1.994323218 seconds time elapsed
+     5,740,674,837      cycles                           #    2.959 GHz                    
+    12,183,340,475      instructions                     #    2.12  insn per cycle         
+       1.996602458 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -79,18 +79,20 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 2.119434e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.130595e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.130595e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.128072e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.139024e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.139024e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.856440e-04 +- 8.331091e-05 )  GeV^-6
-TOTAL       :     2.023198 sec
+TOTAL       :     2.036931 sec
 INFO: No Floating Point Exceptions have been reported
-     6,866,209,578      cycles                           #    3.012 GHz                    
-    14,247,651,349      instructions                     #    2.08  insn per cycle         
-       2.335730547 seconds time elapsed
+     6,817,978,012      cycles                           #    2.973 GHz                    
+    15,086,512,597      instructions                     #    2.21  insn per cycle         
+       2.349967443 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -108,20 +110,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.811807e+01                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.812078e+01                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.812078e+01                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.676163e+01                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.676428e+01                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.676428e+01                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187013e-05 +- 9.825040e-06 )  GeV^-6
-TOTAL       :     5.996292 sec
+TOTAL       :     6.087276 sec
 INFO: No Floating Point Exceptions have been reported
-    18,222,904,500      cycles                           #    3.038 GHz                    
-    53,917,190,034      instructions                     #    2.96  insn per cycle         
-       6.000226691 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:20155) (avx2:    0) (512y:    0) (512z:    0)
+    18,179,826,190      cycles                           #    2.985 GHz                    
+    53,910,247,266      instructions                     #    2.97  insn per cycle         
+       6.091212728 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:20142) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -137,20 +140,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.510518e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.510973e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.510973e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.464690e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.465102e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.465102e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187013e-05 +- 9.825037e-06 )  GeV^-6
-TOTAL       :     1.509640 sec
+TOTAL       :     1.525630 sec
 INFO: No Floating Point Exceptions have been reported
-     4,625,465,113      cycles                           #    3.057 GHz                    
-    13,814,506,056      instructions                     #    2.99  insn per cycle         
-       1.513610135 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:97034) (avx2:    0) (512y:    0) (512z:    0)
+     4,590,585,740      cycles                           #    3.003 GHz                    
+    13,807,319,566      instructions                     #    3.01  insn per cycle         
+       1.529386769 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:97022) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -166,20 +170,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 6.936529e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.938180e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.938180e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.967974e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.969738e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.969738e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187188e-05 +- 9.826767e-06 )  GeV^-6
-TOTAL       :     0.766429 sec
+TOTAL       :     0.760104 sec
 INFO: No Floating Point Exceptions have been reported
-     2,173,283,102      cycles                           #    2.824 GHz                    
-     4,844,170,694      instructions                     #    2.23  insn per cycle         
-       0.770393352 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:85515) (512y:    0) (512z:    0)
+     2,138,286,262      cycles                           #    2.802 GHz                    
+     4,837,282,487      instructions                     #    2.26  insn per cycle         
+       0.763970265 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:85497) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -195,20 +200,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.970392e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.972585e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.972585e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.967332e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.969544e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.969544e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187188e-05 +- 9.826767e-06 )  GeV^-6
-TOTAL       :     0.667630 sec
+TOTAL       :     0.664857 sec
 INFO: No Floating Point Exceptions have been reported
-     1,889,307,000      cycles                           #    2.817 GHz                    
-     4,298,323,850      instructions                     #    2.28  insn per cycle         
-       0.671620510 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:81214) (512y:   44) (512z:    0)
+     1,870,319,411      cycles                           #    2.799 GHz                    
+     4,291,006,476      instructions                     #    2.29  insn per cycle         
+       0.668734591 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:81190) (512y:   44) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -224,20 +230,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.302679e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.304907e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.304907e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.241242e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.243401e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.243401e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187188e-05 +- 9.826771e-06 )  GeV^-6
-TOTAL       :     0.728718 sec
+TOTAL       :     0.731334 sec
 INFO: No Floating Point Exceptions have been reported
-     1,362,980,476      cycles                           #    1.862 GHz                    
-     2,169,928,812      instructions                     #    1.59  insn per cycle         
-       0.732739722 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3493) (512y:   47) (512z:79334)
+     1,357,966,074      cycles                           #    1.849 GHz                    
+     2,162,865,434      instructions                     #    1.59  insn per cycle         
+       0.735255583 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3469) (512y:   47) (512z:79334)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt
index ca329577bc..cfac3f719e 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 
-DATE: 2024-06-28_20:28:55
+DATE: 2024-08-08_20:02:49
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 6.286645e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.287498e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.287809e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.289590e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.290901e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.291153e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.186984e-05 +- 9.824899e-06 )  GeV^-6
-TOTAL       :     1.736228 sec
+TOTAL       :     1.752222 sec
 INFO: No Floating Point Exceptions have been reported
-     6,001,807,556      cycles                           #    3.011 GHz                    
-    11,967,206,185      instructions                     #    1.99  insn per cycle         
-       2.050151331 seconds time elapsed
+     6,011,479,262      cycles                           #    2.988 GHz                    
+    11,822,786,435      instructions                     #    1.97  insn per cycle         
+       2.068235514 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 2.143364e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.143971e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.144046e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.118039e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.118627e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.118705e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.856829e-04 +- 8.333437e-05 )  GeV^-6
-TOTAL       :     2.058089 sec
+TOTAL       :     2.087174 sec
 INFO: No Floating Point Exceptions have been reported
-     7,010,517,846      cycles                           #    3.026 GHz                    
-    15,562,554,743      instructions                     #    2.22  insn per cycle         
-       2.372415375 seconds time elapsed
+     7,020,765,748      cycles                           #    2.977 GHz                    
+    15,445,166,662      instructions                     #    2.20  insn per cycle         
+       2.414506634 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.771160e+01                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.771418e+01                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.771418e+01                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.753426e+01                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.753693e+01                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.753693e+01                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187013e-05 +- 9.825040e-06 )  GeV^-6
-TOTAL       :     6.026290 sec
+TOTAL       :     6.033711 sec
 INFO: No Floating Point Exceptions have been reported
-    18,379,600,352      cycles                           #    3.049 GHz                    
-    53,901,368,839      instructions                     #    2.93  insn per cycle         
-       6.030226018 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:20155) (avx2:    0) (512y:    0) (512z:    0)
+    18,095,249,979      cycles                           #    2.998 GHz                    
+    53,894,797,748      instructions                     #    2.98  insn per cycle         
+       6.037598164 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:20142) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.509821e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.510256e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.510256e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.476703e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.477111e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.477111e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187013e-05 +- 9.825037e-06 )  GeV^-6
-TOTAL       :     1.509733 sec
+TOTAL       :     1.520725 sec
 INFO: No Floating Point Exceptions have been reported
-     4,595,160,001      cycles                           #    3.037 GHz                    
-    13,806,656,105      instructions                     #    3.00  insn per cycle         
-       1.513907845 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:96669) (avx2:    0) (512y:    0) (512z:    0)
+     4,582,334,771      cycles                           #    3.007 GHz                    
+    13,799,523,503      instructions                     #    3.01  insn per cycle         
+       1.524516230 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:96657) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.059545e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.061253e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.061253e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.920572e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.922271e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.922271e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187188e-05 +- 9.826767e-06 )  GeV^-6
-TOTAL       :     0.752009 sec
+TOTAL       :     0.764164 sec
 INFO: No Floating Point Exceptions have been reported
-     2,140,130,377      cycles                           #    2.833 GHz                    
-     4,847,155,413      instructions                     #    2.26  insn per cycle         
-       0.756177798 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:85905) (512y:    0) (512z:    0)
+     2,153,123,984      cycles                           #    2.806 GHz                    
+     4,840,163,805      instructions                     #    2.25  insn per cycle         
+       0.767980176 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:85887) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.995651e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.997972e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.997972e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.954158e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.956209e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.956209e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187188e-05 +- 9.826767e-06 )  GeV^-6
-TOTAL       :     0.665225 sec
+TOTAL       :     0.665841 sec
 INFO: No Floating Point Exceptions have been reported
-     1,902,280,846      cycles                           #    2.845 GHz                    
-     4,300,952,608      instructions                     #    2.26  insn per cycle         
-       0.669196467 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:81754) (512y:   24) (512z:    0)
+     1,891,343,146      cycles                           #    2.826 GHz                    
+     4,293,658,543      instructions                     #    2.27  insn per cycle         
+       0.669786991 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:81730) (512y:   24) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.312392e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.314887e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.314887e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.171151e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.173263e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.173263e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187188e-05 +- 9.826771e-06 )  GeV^-6
-TOTAL       :     0.728015 sec
+TOTAL       :     0.740474 sec
 INFO: No Floating Point Exceptions have been reported
-     1,370,437,119      cycles                           #    1.874 GHz                    
-     2,175,702,006      instructions                     #    1.59  insn per cycle         
-       0.731950621 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4106) (512y:   32) (512z:79555)
+     1,358,622,018      cycles                           #    1.827 GHz                    
+     2,168,397,288      instructions                     #    1.60  insn per cycle         
+       0.744609857 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4082) (512y:   32) (512z:79555)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt
index 851a39f552..30f43d1d54 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 
-DATE: 2024-06-28_20:29:34
+DATE: 2024-08-08_20:03:38
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.691814e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.692323e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.692498e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.679462e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.679946e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.680144e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825548e-06 )  GeV^-6
-TOTAL       :     2.173801 sec
+TOTAL       :     2.195383 sec
 INFO: No Floating Point Exceptions have been reported
-     7,533,775,938      cycles                           #    3.018 GHz                    
-    16,627,293,057      instructions                     #    2.21  insn per cycle         
-       2.552141855 seconds time elapsed
+     7,438,879,261      cycles                           #    2.953 GHz                    
+    16,326,818,821      instructions                     #    2.19  insn per cycle         
+       2.577345674 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 1.107805e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.108082e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.108125e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.108202e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.108498e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.108526e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.856249e-04 +- 8.329951e-05 )  GeV^-6
-TOTAL       :     3.402776 sec
+TOTAL       :     3.425728 sec
 INFO: No Floating Point Exceptions have been reported
-    11,035,265,539      cycles                           #    2.949 GHz                    
-    25,209,132,705      instructions                     #    2.28  insn per cycle         
-       3.797925240 seconds time elapsed
+    11,268,079,350      cycles                           #    3.003 GHz                    
+    26,526,619,371      instructions                     #    2.35  insn per cycle         
+       3.809078207 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.863438e+01                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.863651e+01                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.863651e+01                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.696399e+01                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.696636e+01                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.696636e+01                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     6.720162 sec
+TOTAL       :     6.867954 sec
 INFO: No Floating Point Exceptions have been reported
-    19,171,030,465      cycles                           #    2.852 GHz                    
-    54,139,658,780      instructions                     #    2.82  insn per cycle         
-       6.724069680 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:32014) (avx2:    0) (512y:    0) (512z:    0)
+    19,211,187,371      cycles                           #    2.796 GHz                    
+    54,136,498,902      instructions                     #    2.82  insn per cycle         
+       6.871886606 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:32001) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.605795e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.605881e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.605881e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.599481e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.599571e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.599571e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825548e-06 )  GeV^-6
-TOTAL       :     3.294366 sec
+TOTAL       :     3.303538 sec
 INFO: No Floating Point Exceptions have been reported
-     9,394,914,106      cycles                           #    2.849 GHz                    
-    26,193,668,002      instructions                     #    2.79  insn per cycle         
-       3.298386549 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:96060) (avx2:    0) (512y:    0) (512z:    0)
+     9,333,906,777      cycles                           #    2.823 GHz                    
+    26,186,384,503      instructions                     #    2.81  insn per cycle         
+       3.307369825 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:96048) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.715414e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.715875e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.715875e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.642781e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.643249e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.643249e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825548e-06 )  GeV^-6
-TOTAL       :     1.427067 sec
+TOTAL       :     1.453378 sec
 INFO: No Floating Point Exceptions have been reported
-     4,066,233,290      cycles                           #    2.843 GHz                    
-     9,255,492,192      instructions                     #    2.28  insn per cycle         
-       1.431038183 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:84396) (512y:    0) (512z:    0)
+     4,089,405,470      cycles                           #    2.807 GHz                    
+     9,248,953,263      instructions                     #    2.26  insn per cycle         
+       1.457404649 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:84378) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.244987e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.245575e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.245575e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.265363e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.265985e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.265985e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825548e-06 )  GeV^-6
-TOTAL       :     1.249256 sec
+TOTAL       :     1.239836 sec
 INFO: No Floating Point Exceptions have been reported
-     3,550,344,735      cycles                           #    2.835 GHz                    
-     8,190,000,965      instructions                     #    2.31  insn per cycle         
-       1.253239411 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:80027) (512y:   79) (512z:    0)
+     3,507,542,927      cycles                           #    2.822 GHz                    
+     8,182,646,854      instructions                     #    2.33  insn per cycle         
+       1.243760162 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:80003) (512y:   79) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.733572e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.734190e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.734190e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.616663e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.617178e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.617178e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825548e-06 )  GeV^-6
-TOTAL       :     1.419842 sec
+TOTAL       :     1.461797 sec
 INFO: No Floating Point Exceptions have been reported
-     2,629,781,082      cycles                           #    1.848 GHz                    
-     4,178,874,518      instructions                     #    1.59  insn per cycle         
-       1.423895743 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2613) (512y:   93) (512z:78909)
+     2,666,404,255      cycles                           #    1.820 GHz                    
+     4,171,669,153      instructions                     #    1.56  insn per cycle         
+       1.465874998 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2589) (512y:   93) (512z:78909)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt
index 01dde53669..7b7d65b2d2 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 
-DATE: 2024-06-28_20:30:25
+DATE: 2024-08-08_20:04:45
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.677056e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.677559e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.677740e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.675385e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.675879e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.676008e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825548e-06 )  GeV^-6
-TOTAL       :     2.176443 sec
+TOTAL       :     2.190431 sec
 INFO: No Floating Point Exceptions have been reported
-     7,487,683,814      cycles                           #    2.999 GHz                    
-    16,520,220,397      instructions                     #    2.21  insn per cycle         
-       2.555460487 seconds time elapsed
+     7,517,385,120      cycles                           #    2.989 GHz                    
+    15,570,357,961      instructions                     #    2.07  insn per cycle         
+       2.571136488 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 1.110219e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.110497e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.110535e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.109468e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.109746e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.109778e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 1.856249e-04 +- 8.329951e-05 )  GeV^-6
-TOTAL       :     3.410801 sec
+TOTAL       :     3.419906 sec
 INFO: No Floating Point Exceptions have been reported
-    11,197,151,914      cycles                           #    2.999 GHz                    
-    25,393,591,354      instructions                     #    2.27  insn per cycle         
-       3.792719865 seconds time elapsed
+    11,221,781,722      cycles                           #    2.994 GHz                    
+    24,236,211,120      instructions                     #    2.16  insn per cycle         
+       3.803243859 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.710962e+01                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.711166e+01                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.711166e+01                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.902849e+01                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.903107e+01                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.903107e+01                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825549e-06 )  GeV^-6
-TOTAL       :     6.858638 sec
+TOTAL       :     6.673081 sec
 INFO: No Floating Point Exceptions have been reported
-    19,224,649,899      cycles                           #    2.802 GHz                    
-    54,164,766,817      instructions                     #    2.82  insn per cycle         
-       6.862669409 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:32216) (avx2:    0) (512y:    0) (512z:    0)
+    19,149,429,604      cycles                           #    2.868 GHz                    
+    54,156,492,076      instructions                     #    2.83  insn per cycle         
+       6.676939828 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:32203) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.613373e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.613459e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.613459e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.571432e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.571520e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.571520e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825548e-06 )  GeV^-6
-TOTAL       :     3.277132 sec
+TOTAL       :     3.363251 sec
 INFO: No Floating Point Exceptions have been reported
-     9,318,655,439      cycles                           #    2.841 GHz                    
-    26,093,521,108      instructions                     #    2.80  insn per cycle         
-       3.281273932 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:95949) (avx2:    0) (512y:    0) (512z:    0)
+     9,398,223,848      cycles                           #    2.792 GHz                    
+    26,086,325,143      instructions                     #    2.78  insn per cycle         
+       3.367354553 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:95937) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.612730e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.613184e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.613184e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.625397e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.625854e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.625854e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825548e-06 )  GeV^-6
-TOTAL       :     1.466168 sec
+TOTAL       :     1.456994 sec
 INFO: No Floating Point Exceptions have been reported
-     4,047,902,147      cycles                           #    2.756 GHz                    
-     9,220,799,448      instructions                     #    2.28  insn per cycle         
-       1.470272690 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:83870) (512y:    0) (512z:    0)
+     4,075,335,135      cycles                           #    2.792 GHz                    
+     9,212,511,442      instructions                     #    2.26  insn per cycle         
+       1.460794766 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:83852) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.298679e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.299308e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.299308e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.243367e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.244047e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.244047e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825548e-06 )  GeV^-6
-TOTAL       :     1.234280 sec
+TOTAL       :     1.245554 sec
 INFO: No Floating Point Exceptions have been reported
-     3,526,966,960      cycles                           #    2.851 GHz                    
-     8,174,270,516      instructions                     #    2.32  insn per cycle         
-       1.238238467 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:79433) (512y:  229) (512z:    0)
+     3,512,150,002      cycles                           #    2.812 GHz                    
+     8,166,955,109      instructions                     #    2.33  insn per cycle         
+       1.249525029 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:79409) (512y:  229) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.733439e+02                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.733991e+02                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.733991e+02                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.660094e+02                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.660683e+02                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.660683e+02                 )  sec^-1
 MeanMatrixElemValue         = ( 1.187066e-05 +- 9.825548e-06 )  GeV^-6
-TOTAL       :     1.419623 sec
+TOTAL       :     1.444444 sec
 INFO: No Floating Point Exceptions have been reported
-     2,627,097,541      cycles                           #    1.846 GHz                    
-     4,173,730,813      instructions                     #    1.59  insn per cycle         
-       1.423630153 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1877) (512y:  175) (512z:78883)
+     2,623,623,826      cycles                           #    1.812 GHz                    
+     4,166,476,704      instructions                     #    1.59  insn per cycle         
+       1.448438406 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1853) (512y:  175) (512z:78883)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt
index 6c1b4ea2ca..dc70f1aa96 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 
-DATE: 2024-06-28_20:25:15
+DATE: 2024-08-08_19:58:27
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 2.790022e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.356109e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.700619e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.793830e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.275665e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.618309e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.453336 sec
+TOTAL       :     0.446682 sec
 INFO: No Floating Point Exceptions have been reported
-     1,989,983,180      cycles                           #    2.947 GHz                    
-     2,785,844,872      instructions                     #    1.40  insn per cycle         
-       0.788940065 seconds time elapsed
+     1,973,218,669      cycles                           #    2.938 GHz                    
+     2,737,206,349      instructions                     #    1.39  insn per cycle         
+       0.728215190 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.587316e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.180244e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.537381e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.512201e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.215148e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.564113e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 2.602505e+02 +- 2.116328e+02 )  GeV^-2
-TOTAL       :     0.524586 sec
+TOTAL       :     0.528377 sec
 INFO: No Floating Point Exceptions have been reported
-     2,302,234,774      cycles                           #    2.983 GHz                    
-     3,293,556,186      instructions                     #    1.43  insn per cycle         
-       0.829451283 seconds time elapsed
+     2,273,295,859      cycles                           #    2.942 GHz                    
+     3,270,605,178      instructions                     #    1.44  insn per cycle         
+       0.829840488 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.097573e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.120521e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.120521e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.087919e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.111512e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.111512e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     1.514920 sec
+TOTAL       :     1.525836 sec
 INFO: No Floating Point Exceptions have been reported
-     4,626,793,284      cycles                           #    3.048 GHz                    
-    13,198,012,215      instructions                     #    2.85  insn per cycle         
-       1.522558693 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  720) (avx2:    0) (512y:    0) (512z:    0)
+     4,620,985,524      cycles                           #    3.021 GHz                    
+    13,191,789,695      instructions                     #    2.85  insn per cycle         
+       1.530034055 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  707) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.950778e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.023577e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.023577e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.913767e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.985469e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.985469e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.860945 sec
+TOTAL       :     0.875694 sec
 INFO: No Floating Point Exceptions have been reported
-     2,649,662,880      cycles                           #    3.065 GHz                    
-     7,563,277,875      instructions                     #    2.85  insn per cycle         
-       0.868046964 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3116) (avx2:    0) (512y:    0) (512z:    0)
+     2,645,390,944      cycles                           #    3.009 GHz                    
+     7,556,169,585      instructions                     #    2.86  insn per cycle         
+       0.879849311 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3099) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.288545e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.501162e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.501162e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.250464e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.457998e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.457998e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.521153 sec
+TOTAL       :     0.522755 sec
 INFO: No Floating Point Exceptions have been reported
-     1,496,320,787      cycles                           #    2.854 GHz                    
-     3,166,843,366      instructions                     #    2.12  insn per cycle         
-       0.528925313 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3002) (512y:    0) (512z:    0)
+     1,489,187,494      cycles                           #    2.830 GHz                    
+     3,159,085,018      instructions                     #    2.12  insn per cycle         
+       0.526770948 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2984) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.533993e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.785485e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.785485e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.609694e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.866945e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.866945e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.487338 sec
+TOTAL       :     0.473366 sec
 INFO: No Floating Point Exceptions have been reported
-     1,356,346,660      cycles                           #    2.768 GHz                    
-     3,021,733,089      instructions                     #    2.23  insn per cycle         
-       0.493578513 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2769) (512y:  104) (512z:    0)
+     1,347,276,225      cycles                           #    2.825 GHz                    
+     3,016,026,977      instructions                     #    2.24  insn per cycle         
+       0.477451794 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2745) (512y:  104) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.478813e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.598693e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.598693e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.459896e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.579821e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.579821e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.686044 sec
+TOTAL       :     0.687520 sec
 INFO: No Floating Point Exceptions have been reported
-     1,333,386,243      cycles                           #    1.934 GHz                    
-     1,969,640,769      instructions                     #    1.48  insn per cycle         
-       0.692703053 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1391) (512y:  106) (512z: 2217)
+     1,326,541,553      cycles                           #    1.920 GHz                    
+     1,964,358,241      instructions                     #    1.48  insn per cycle         
+       0.691777094 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1367) (512y:  106) (512z: 2217)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt
index cab9bcc977..280fcce352 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 
-DATE: 2024-06-28_20:51:54
+DATE: 2024-08-08_20:19:09
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks
 Process                     = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.479240e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.008789e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.008789e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.684298e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.299204e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.299204e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.475656 sec
+TOTAL       :     0.471497 sec
 INFO: No Floating Point Exceptions have been reported
-     2,038,893,126      cycles                           #    2.960 GHz                    
-     2,977,070,828      instructions                     #    1.46  insn per cycle         
-       0.745313870 seconds time elapsed
+     2,016,663,667      cycles                           #    2.932 GHz                    
+     2,996,818,007      instructions                     #    1.49  insn per cycle         
+       0.744526851 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -79,18 +79,20 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process                     = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.327342e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.480237e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.480237e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.407307e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.579683e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.579683e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 2.602505e+02 +- 2.116328e+02 )  GeV^-2
-TOTAL       :     0.748104 sec
+TOTAL       :     0.738495 sec
 INFO: No Floating Point Exceptions have been reported
-     2,948,004,634      cycles                           #    2.961 GHz                    
-     4,497,963,430      instructions                     #    1.53  insn per cycle         
-       1.054638211 seconds time elapsed
+     2,913,311,119      cycles                           #    2.959 GHz                    
+     4,473,148,579      instructions                     #    1.54  insn per cycle         
+       1.042109459 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -108,20 +110,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.078327e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.102056e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.102056e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.071825e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.094847e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.094847e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     1.548417 sec
+TOTAL       :     1.553859 sec
 INFO: No Floating Point Exceptions have been reported
-     4,667,469,364      cycles                           #    3.007 GHz                    
-    13,202,731,600      instructions                     #    2.83  insn per cycle         
-       1.552883532 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  720) (avx2:    0) (512y:    0) (512z:    0)
+     4,647,790,593      cycles                           #    2.984 GHz                    
+    13,197,257,990      instructions                     #    2.84  insn per cycle         
+       1.558215122 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  707) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -137,20 +140,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.932132e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.004159e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.004159e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.902347e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.973784e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.973784e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.875687 sec
+TOTAL       :     0.886591 sec
 INFO: No Floating Point Exceptions have been reported
-     2,682,056,903      cycles                           #    3.050 GHz                    
-     7,610,496,007      instructions                     #    2.84  insn per cycle         
-       0.880075632 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3116) (avx2:    0) (512y:    0) (512z:    0)
+     2,676,044,915      cycles                           #    3.006 GHz                    
+     7,604,510,010      instructions                     #    2.84  insn per cycle         
+       0.890913281 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3099) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -166,20 +170,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.293837e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.504426e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.504426e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.212543e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.422665e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.422665e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.526616 sec
+TOTAL       :     0.536325 sec
 INFO: No Floating Point Exceptions have been reported
-     1,529,323,143      cycles                           #    2.883 GHz                    
-     3,217,059,809      instructions                     #    2.10  insn per cycle         
-       0.531008511 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3002) (512y:    0) (512z:    0)
+     1,528,484,723      cycles                           #    2.830 GHz                    
+     3,209,947,960      instructions                     #    2.10  insn per cycle         
+       0.540711031 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2984) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -195,20 +200,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.666271e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.933201e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.933201e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.560716e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.811838e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.811838e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.475965 sec
+TOTAL       :     0.486090 sec
 INFO: No Floating Point Exceptions have been reported
-     1,385,254,715      cycles                           #    2.890 GHz                    
-     3,070,395,317      instructions                     #    2.22  insn per cycle         
-       0.480322475 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2769) (512y:  104) (512z:    0)
+     1,376,959,578      cycles                           #    2.811 GHz                    
+     3,063,340,210      instructions                     #    2.22  insn per cycle         
+       0.490411106 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2745) (512y:  104) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -224,20 +230,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.294434e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.409827e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.409827e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.438051e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.554379e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.554379e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.747092 sec
+TOTAL       :     0.699323 sec
 INFO: No Floating Point Exceptions have been reported
-     1,380,748,031      cycles                           #    1.839 GHz                    
-     2,007,713,125      instructions                     #    1.45  insn per cycle         
-       0.751567886 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1391) (512y:  106) (512z: 2217)
+     1,353,225,054      cycles                           #    1.926 GHz                    
+     1,999,803,163      instructions                     #    1.48  insn per cycle         
+       0.703554082 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1367) (512y:  106) (512z: 2217)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt
index 1c33195921..0801a72f2e 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 
-DATE: 2024-06-28_20:25:28
+DATE: 2024-08-08_19:58:40
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 2.775583e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.188831e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.517095e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.715940e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.160616e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.486831e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.446354 sec
+TOTAL       :     0.449924 sec
 INFO: No Floating Point Exceptions have been reported
-     1,978,395,800      cycles                           #    2.952 GHz                    
-     2,771,408,017      instructions                     #    1.40  insn per cycle         
-       0.796607303 seconds time elapsed
+     1,942,000,933      cycles                           #    2.932 GHz                    
+     2,723,193,332      instructions                     #    1.40  insn per cycle         
+       0.721112435 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.570046e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.090310e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.431455e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.484674e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.054198e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.395966e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 2.602505e+02 +- 2.116328e+02 )  GeV^-2
-TOTAL       :     0.526554 sec
+TOTAL       :     0.530941 sec
 INFO: No Floating Point Exceptions have been reported
-     2,277,253,734      cycles                           #    2.972 GHz                    
-     3,270,762,044      instructions                     #    1.44  insn per cycle         
-       0.823669445 seconds time elapsed
+     2,253,028,696      cycles                           #    2.947 GHz                    
+     3,232,782,518      instructions                     #    1.43  insn per cycle         
+       0.823488099 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.099420e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.122937e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.122937e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.055734e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.078647e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.078647e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     1.512747 sec
+TOTAL       :     1.572076 sec
 INFO: No Floating Point Exceptions have been reported
-     4,627,982,694      cycles                           #    3.052 GHz                    
-    13,186,184,949      instructions                     #    2.85  insn per cycle         
-       1.519924373 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  705) (avx2:    0) (512y:    0) (512z:    0)
+     4,625,532,940      cycles                           #    2.937 GHz                    
+    13,181,547,125      instructions                     #    2.85  insn per cycle         
+       1.575799334 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  692) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.949013e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.023241e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.023241e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.856450e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.926302e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.926302e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.863276 sec
+TOTAL       :     0.902285 sec
 INFO: No Floating Point Exceptions have been reported
-     2,643,207,158      cycles                           #    3.053 GHz                    
-     7,560,625,846      instructions                     #    2.86  insn per cycle         
-       0.869204519 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3110) (avx2:    0) (512y:    0) (512z:    0)
+     2,641,918,143      cycles                           #    2.918 GHz                    
+     7,554,356,585      instructions                     #    2.86  insn per cycle         
+       0.906092774 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3093) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.287048e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.500557e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.500557e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.249746e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.464508e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.464508e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.520133 sec
+TOTAL       :     0.523830 sec
 INFO: No Floating Point Exceptions have been reported
-     1,497,606,910      cycles                           #    2.858 GHz                    
-     3,165,816,703      instructions                     #    2.11  insn per cycle         
-       0.527177034 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2987) (512y:    0) (512z:    0)
+     1,491,771,401      cycles                           #    2.831 GHz                    
+     3,160,437,103      instructions                     #    2.12  insn per cycle         
+       0.527543251 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2969) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.489743e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.732662e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.732662e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.610049e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.870786e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.870786e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.492703 sec
+TOTAL       :     0.473152 sec
 INFO: No Floating Point Exceptions have been reported
-     1,355,352,613      cycles                           #    2.736 GHz                    
-     3,018,112,059      instructions                     #    2.23  insn per cycle         
-       0.498338855 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2743) (512y:  104) (512z:    0)
+     1,347,000,026      cycles                           #    2.829 GHz                    
+     3,012,563,261      instructions                     #    2.24  insn per cycle         
+       0.476761119 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2719) (512y:  104) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.482908e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.602468e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.602468e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.451125e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.569830e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.569830e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.684096 sec
+TOTAL       :     0.689809 sec
 INFO: No Floating Point Exceptions have been reported
-     1,331,295,161      cycles                           #    1.935 GHz                    
-     1,967,793,261      instructions                     #    1.48  insn per cycle         
-       0.690431223 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1368) (512y:  106) (512z: 2217)
+     1,325,269,157      cycles                           #    1.912 GHz                    
+     1,962,212,225      instructions                     #    1.48  insn per cycle         
+       0.693734086 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1344) (512y:  106) (512z: 2217)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt
index 850aaf835d..776a8e7cf2 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 
-DATE: 2024-06-28_20:25:41
+DATE: 2024-08-08_19:58:53
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 5.293532e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.042506e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.137949e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.177753e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.044280e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.137137e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018174e+01 +- 1.429492e+01 )  GeV^-2
-TOTAL       :     0.451562 sec
+TOTAL       :     0.446256 sec
 INFO: No Floating Point Exceptions have been reported
-     1,965,513,776      cycles                           #    2.824 GHz                    
-     2,635,583,653      instructions                     #    1.34  insn per cycle         
-       0.859142260 seconds time elapsed
+     1,967,028,633      cycles                           #    2.927 GHz                    
+     2,729,560,871      instructions                     #    1.39  insn per cycle         
+       0.730482007 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 165
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 7.368956e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.537935e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.624387e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.302708e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.525963e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.623999e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.571360e+02 +- 2.114020e+02 )  GeV^-2
-TOTAL       :     0.475188 sec
+TOTAL       :     0.480335 sec
 INFO: No Floating Point Exceptions have been reported
-     2,098,261,664      cycles                           #    2.944 GHz                    
-     2,954,481,606      instructions                     #    1.41  insn per cycle         
-       0.769790018 seconds time elapsed
+     2,062,608,643      cycles                           #    2.922 GHz                    
+     2,954,769,461      instructions                     #    1.43  insn per cycle         
+       0.763163038 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.150060e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.176080e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.176080e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.132642e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.159370e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.159370e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018564e+01 +- 1.429903e+01 )  GeV^-2
-TOTAL       :     1.446431 sec
+TOTAL       :     1.464071 sec
 INFO: No Floating Point Exceptions have been reported
-     4,411,772,032      cycles                           #    3.045 GHz                    
-    12,958,210,870      instructions                     #    2.94  insn per cycle         
-       1.451536636 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  658) (avx2:    0) (512y:    0) (512z:    0)
+     4,406,453,406      cycles                           #    3.003 GHz                    
+    12,951,424,799      instructions                     #    2.94  insn per cycle         
+       1.468164938 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  645) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.900734e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.084469e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.084469e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.856948e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.035260e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.035260e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018564e+01 +- 1.429903e+01 )  GeV^-2
-TOTAL       :     0.586035 sec
+TOTAL       :     0.590761 sec
 INFO: No Floating Point Exceptions have been reported
-     1,737,693,108      cycles                           #    2.951 GHz                    
-     4,549,270,467      instructions                     #    2.62  insn per cycle         
-       0.593914325 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3638) (avx2:    0) (512y:    0) (512z:    0)
+     1,725,972,010      cycles                           #    2.906 GHz                    
+     4,541,556,745      instructions                     #    2.63  insn per cycle         
+       0.594447330 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3626) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.861820e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.580664e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.580664e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.798317e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.520080e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.520080e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018828e+01 +- 1.429922e+01 )  GeV^-2
-TOTAL       :     0.300710 sec
+TOTAL       :     0.300105 sec
 INFO: No Floating Point Exceptions have been reported
-       862,449,479      cycles                           #    2.844 GHz                    
-     1,924,591,814      instructions                     #    2.23  insn per cycle         
-       0.306904344 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3584) (512y:    0) (512z:    0)
+       854,524,206      cycles                           #    2.821 GHz                    
+     1,917,397,512      instructions                     #    2.24  insn per cycle         
+       0.303595328 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3566) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 6.340644e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.181705e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.181705e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.187295e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.004492e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.004492e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018828e+01 +- 1.429922e+01 )  GeV^-2
-TOTAL       :     0.279461 sec
+TOTAL       :     0.282163 sec
 INFO: No Floating Point Exceptions have been reported
-       808,719,930      cycles                           #    2.865 GHz                    
-     1,841,626,395      instructions                     #    2.28  insn per cycle         
-       0.284314491 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3414) (512y:   22) (512z:    0)
+       807,334,376      cycles                           #    2.832 GHz                    
+     1,834,144,656      instructions                     #    2.27  insn per cycle         
+       0.285676418 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3390) (512y:   22) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.796043e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.289013e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.289013e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.697538e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.170455e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.170455e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018829e+01 +- 1.429922e+01 )  GeV^-2
-TOTAL       :     0.364949 sec
+TOTAL       :     0.368301 sec
 INFO: No Floating Point Exceptions have been reported
-       736,833,879      cycles                           #    2.005 GHz                    
-     1,315,783,402      instructions                     #    1.79  insn per cycle         
-       0.370048804 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2005) (512y:   32) (512z: 2432)
+       729,603,114      cycles                           #    1.965 GHz                    
+     1,308,166,262      instructions                     #    1.79  insn per cycle         
+       0.371960958 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1942) (512y:   26) (512z: 2432)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt
index de1a622f9c..e112255ddc 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 
-DATE: 2024-06-28_20:52:07
+DATE: 2024-08-08_20:19:22
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks
 Process                     = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 5.522257e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.599137e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.599137e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.675417e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.135496e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.135496e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 2.017654e+01 +- 1.429183e+01 )  GeV^-2
-TOTAL       :     0.455736 sec
+TOTAL       :     0.454896 sec
 INFO: No Floating Point Exceptions have been reported
-     1,976,950,200      cycles                           #    2.968 GHz                    
-     2,931,197,541      instructions                     #    1.48  insn per cycle         
-       0.723128470 seconds time elapsed
+     1,922,075,239      cycles                           #    2.886 GHz                    
+     2,812,656,009      instructions                     #    1.46  insn per cycle         
+       0.723103268 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -79,18 +79,20 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process                     = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 5.174973e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.635538e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.635538e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.230387e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.891837e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.891837e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 2.609941e+02 +- 2.115589e+02 )  GeV^-2
-TOTAL       :     0.623300 sec
+TOTAL       :     0.622542 sec
 INFO: No Floating Point Exceptions have been reported
-     2,540,926,352      cycles                           #    2.970 GHz                    
-     3,858,630,672      instructions                     #    1.52  insn per cycle         
-       0.912725281 seconds time elapsed
+     2,509,793,238      cycles                           #    2.945 GHz                    
+     3,839,626,015      instructions                     #    1.53  insn per cycle         
+       0.910444487 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -108,20 +110,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.151240e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.177166e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.177166e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.133555e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.159187e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.159187e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018564e+01 +- 1.429903e+01 )  GeV^-2
-TOTAL       :     1.447433 sec
+TOTAL       :     1.466168 sec
 INFO: No Floating Point Exceptions have been reported
-     4,427,281,485      cycles                           #    3.051 GHz                    
-    12,962,707,314      instructions                     #    2.93  insn per cycle         
-       1.451753088 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  658) (avx2:    0) (512y:    0) (512z:    0)
+     4,419,438,233      cycles                           #    3.007 GHz                    
+    12,955,838,618      instructions                     #    2.93  insn per cycle         
+       1.470344991 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  645) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -137,20 +140,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.955554e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.138018e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.138018e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.929772e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.111984e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.111984e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018564e+01 +- 1.429903e+01 )  GeV^-2
-TOTAL       :     0.578329 sec
+TOTAL       :     0.580373 sec
 INFO: No Floating Point Exceptions have been reported
-     1,758,401,594      cycles                           #    3.021 GHz                    
-     4,597,076,777      instructions                     #    2.61  insn per cycle         
-       0.582909175 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3638) (avx2:    0) (512y:    0) (512z:    0)
+     1,747,268,230      cycles                           #    2.992 GHz                    
+     4,589,745,792      instructions                     #    2.63  insn per cycle         
+       0.584483983 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3626) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -166,20 +170,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.760381e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.479793e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.479793e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.766764e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.470194e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.470194e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018828e+01 +- 1.429922e+01 )  GeV^-2
-TOTAL       :     0.308970 sec
+TOTAL       :     0.305547 sec
 INFO: No Floating Point Exceptions have been reported
-       885,542,338      cycles                           #    2.833 GHz                    
-     1,961,389,336      instructions                     #    2.21  insn per cycle         
-       0.313214968 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3584) (512y:    0) (512z:    0)
+       873,235,026      cycles                           #    2.827 GHz                    
+     1,954,283,245      instructions                     #    2.24  insn per cycle         
+       0.309543568 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3566) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -195,20 +200,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 6.249080e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.082136e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.082136e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.204649e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.052966e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.052966e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018828e+01 +- 1.429922e+01 )  GeV^-2
-TOTAL       :     0.286383 sec
+TOTAL       :     0.285349 sec
 INFO: No Floating Point Exceptions have been reported
-       829,534,712      cycles                           #    2.859 GHz                    
-     1,878,022,002      instructions                     #    2.26  insn per cycle         
-       0.290621670 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3414) (512y:   22) (512z:    0)
+       822,856,149      cycles                           #    2.849 GHz                    
+     1,871,067,127      instructions                     #    2.27  insn per cycle         
+       0.289383401 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3390) (512y:   22) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -224,20 +230,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.707763e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.189867e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.189867e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.709235e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.178014e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.178014e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018829e+01 +- 1.429922e+01 )  GeV^-2
-TOTAL       :     0.375256 sec
+TOTAL       :     0.371559 sec
 INFO: No Floating Point Exceptions have been reported
-       759,283,515      cycles                           #    2.009 GHz                    
-     1,357,256,338      instructions                     #    1.79  insn per cycle         
-       0.379485732 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2005) (512y:   32) (512z: 2432)
+       748,105,287      cycles                           #    1.994 GHz                    
+     1,349,627,266      instructions                     #    1.80  insn per cycle         
+       0.375758776 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1942) (512y:   26) (512z: 2432)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt
index e0566e4426..f4c5647b28 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 
-DATE: 2024-06-28_20:25:53
+DATE: 2024-08-08_19:59:05
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 5.353171e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.062702e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.165312e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.121935e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.045477e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.150621e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018174e+01 +- 1.429492e+01 )  GeV^-2
-TOTAL       :     0.443834 sec
+TOTAL       :     0.441822 sec
 INFO: No Floating Point Exceptions have been reported
-     1,941,858,797      cycles                           #    2.954 GHz                    
-     2,754,746,518      instructions                     #    1.42  insn per cycle         
-       0.817724378 seconds time elapsed
+     1,919,824,453      cycles                           #    2.925 GHz                    
+     2,711,548,396      instructions                     #    1.41  insn per cycle         
+       0.712257308 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 164
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 7.429407e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.572953e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.663069e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.453927e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.579708e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.670884e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.571360e+02 +- 2.114020e+02 )  GeV^-2
-TOTAL       :     0.480050 sec
+TOTAL       :     0.482328 sec
 INFO: No Floating Point Exceptions have been reported
-     2,073,786,110      cycles                           #    2.949 GHz                    
-     2,967,574,310      instructions                     #    1.43  insn per cycle         
-       0.762050426 seconds time elapsed
+     2,075,215,740      cycles                           #    2.939 GHz                    
+     2,958,576,913      instructions                     #    1.43  insn per cycle         
+       0.765173729 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.132529e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.158514e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.158514e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.138812e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.164706e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.164706e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018564e+01 +- 1.429903e+01 )  GeV^-2
-TOTAL       :     1.467260 sec
+TOTAL       :     1.455800 sec
 INFO: No Floating Point Exceptions have been reported
-     4,412,522,593      cycles                           #    3.000 GHz                    
-    12,934,109,749      instructions                     #    2.93  insn per cycle         
-       1.475427698 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  643) (avx2:    0) (512y:    0) (512z:    0)
+     4,403,258,677      cycles                           #    3.018 GHz                    
+    12,926,930,475      instructions                     #    2.94  insn per cycle         
+       1.459744309 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  630) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.991463e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.176020e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.176020e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.936303e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.120025e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.120025e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018564e+01 +- 1.429903e+01 )  GeV^-2
-TOTAL       :     0.567363 sec
+TOTAL       :     0.574725 sec
 INFO: No Floating Point Exceptions have been reported
-     1,733,083,388      cycles                           #    3.035 GHz                    
-     4,543,937,033      instructions                     #    2.62  insn per cycle         
-       0.574183678 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3622) (avx2:    0) (512y:    0) (512z:    0)
+     1,726,777,095      cycles                           #    2.987 GHz                    
+     4,536,166,658      instructions                     #    2.63  insn per cycle         
+       0.578775017 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3610) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.771328e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.486383e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.486383e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.813817e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.547021e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.547021e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018828e+01 +- 1.429922e+01 )  GeV^-2
-TOTAL       :     0.304220 sec
+TOTAL       :     0.298922 sec
 INFO: No Floating Point Exceptions have been reported
-       863,108,492      cycles                           #    2.802 GHz                    
-     1,921,511,708      instructions                     #    2.23  insn per cycle         
-       0.311629404 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3554) (512y:    0) (512z:    0)
+       857,389,967      cycles                           #    2.838 GHz                    
+     1,914,305,415      instructions                     #    2.23  insn per cycle         
+       0.302780018 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3536) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 6.362713e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.211365e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.211365e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.307694e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.166095e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.166095e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018828e+01 +- 1.429922e+01 )  GeV^-2
-TOTAL       :     0.277439 sec
+TOTAL       :     0.276778 sec
 INFO: No Floating Point Exceptions have been reported
-       808,884,759      cycles                           #    2.877 GHz                    
-     1,837,754,812      instructions                     #    2.27  insn per cycle         
-       0.284539698 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3378) (512y:   22) (512z:    0)
+       801,815,801      cycles                           #    2.863 GHz                    
+     1,829,952,798      instructions                     #    2.28  insn per cycle         
+       0.280644988 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3354) (512y:   22) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.756673e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.241916e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.241916e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.668444e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.134327e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.134327e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018829e+01 +- 1.429922e+01 )  GeV^-2
-TOTAL       :     0.366455 sec
+TOTAL       :     0.370402 sec
 INFO: No Floating Point Exceptions have been reported
-       736,983,260      cycles                           #    1.990 GHz                    
-     1,313,795,216      instructions                     #    1.78  insn per cycle         
-       0.373998194 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1968) (512y:   32) (512z: 2435)
+       727,659,849      cycles                           #    1.947 GHz                    
+     1,306,194,061      instructions                     #    1.80  insn per cycle         
+       0.374419699 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1905) (512y:   26) (512z: 2435)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
index 302ee19ef4..14cf46cbcc 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 
-DATE: 2024-06-28_20:26:06
+DATE: 2024-08-08_19:59:17
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 2.857164e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.383212e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.731282e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.769849e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.334726e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.696577e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.445846 sec
+TOTAL       :     0.447945 sec
 INFO: No Floating Point Exceptions have been reported
-     1,989,381,537      cycles                           #    2.950 GHz                    
-     2,820,748,525      instructions                     #    1.42  insn per cycle         
-       0.741246343 seconds time elapsed
+     1,970,077,649      cycles                           #    2.938 GHz                    
+     2,764,650,199      instructions                     #    1.40  insn per cycle         
+       0.727384144 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.606744e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.198388e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.549914e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.502555e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.204679e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.563131e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 2.602505e+02 +- 2.116328e+02 )  GeV^-2
-TOTAL       :     0.526247 sec
+TOTAL       :     0.530343 sec
 INFO: No Floating Point Exceptions have been reported
-     2,280,337,997      cycles                           #    2.953 GHz                    
-     3,250,188,330      instructions                     #    1.43  insn per cycle         
-       0.829002703 seconds time elapsed
+     2,259,914,656      cycles                           #    2.930 GHz                    
+     3,250,253,432      instructions                     #    1.44  insn per cycle         
+       0.828686428 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.088780e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.111564e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.111564e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.069358e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.092261e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.092261e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     1.528773 sec
+TOTAL       :     1.552012 sec
 INFO: No Floating Point Exceptions have been reported
-     4,656,228,550      cycles                           #    3.043 GHz                    
-    13,184,186,040      instructions                     #    2.83  insn per cycle         
-       1.534455484 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  694) (avx2:    0) (512y:    0) (512z:    0)
+     4,641,202,069      cycles                           #    2.985 GHz                    
+    13,179,687,646      instructions                     #    2.84  insn per cycle         
+       1.555810770 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  681) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.955974e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.029658e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.029658e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.876933e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.946940e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.946940e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.859548 sec
+TOTAL       :     0.892460 sec
 INFO: No Floating Point Exceptions have been reported
-     2,646,287,948      cycles                           #    3.067 GHz                    
-     7,482,344,904      instructions                     #    2.83  insn per cycle         
-       0.865760719 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3164) (avx2:    0) (512y:    0) (512z:    0)
+     2,644,592,448      cycles                           #    2.953 GHz                    
+     7,475,728,591      instructions                     #    2.83  insn per cycle         
+       0.896244087 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3152) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.313508e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.535972e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.535972e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.303870e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.519584e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.519584e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.517719 sec
+TOTAL       :     0.515449 sec
 INFO: No Floating Point Exceptions have been reported
-     1,485,493,063      cycles                           #    2.852 GHz                    
-     3,134,701,988      instructions                     #    2.11  insn per cycle         
-       0.523996490 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3137) (512y:    0) (512z:    0)
+     1,473,674,467      cycles                           #    2.841 GHz                    
+     3,129,036,980      instructions                     #    2.12  insn per cycle         
+       0.519216773 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3119) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.732479e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.004169e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.004169e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.630465e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.893768e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.893768e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.461514 sec
+TOTAL       :     0.471176 sec
 INFO: No Floating Point Exceptions have been reported
-     1,325,634,826      cycles                           #    2.855 GHz                    
-     2,988,988,966      instructions                     #    2.25  insn per cycle         
-       0.467757291 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2905) (512y:  110) (512z:    0)
+     1,324,066,570      cycles                           #    2.791 GHz                    
+     2,982,910,932      instructions                     #    2.25  insn per cycle         
+       0.474943404 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2881) (512y:  110) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.417582e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.531347e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.531347e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.354541e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.462714e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.462714e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.702407 sec
+TOTAL       :     0.717844 sec
 INFO: No Floating Point Exceptions have been reported
-     1,370,431,429      cycles                           #    1.943 GHz                    
-     1,997,409,518      instructions                     #    1.46  insn per cycle         
-       0.709124260 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1737) (512y:  114) (512z: 2251)
+     1,364,512,931      cycles                           #    1.893 GHz                    
+     1,991,624,740      instructions                     #    1.46  insn per cycle         
+       0.721728207 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1656) (512y:  108) (512z: 2251)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt
index 25931b257e..5b20c017bf 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 
-DATE: 2024-06-28_20:26:19
+DATE: 2024-08-08_19:59:31
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 2.833950e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.211642e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.559558e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.764426e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.211229e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.545216e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.449366 sec
+TOTAL       :     0.450206 sec
 INFO: No Floating Point Exceptions have been reported
-     1,956,372,552      cycles                           #    2.947 GHz                    
-     2,728,568,804      instructions                     #    1.39  insn per cycle         
-       0.863711942 seconds time elapsed
+     1,949,946,468      cycles                           #    2.935 GHz                    
+     2,761,346,859      instructions                     #    1.42  insn per cycle         
+       0.722536101 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.575384e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.039636e+07                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.376287e+07                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.478869e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.028008e+07                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.358881e+07                 )  sec^-1
 MeanMatrixElemValue         = ( 2.602505e+02 +- 2.116328e+02 )  GeV^-2
-TOTAL       :     0.525393 sec
+TOTAL       :     0.526742 sec
 INFO: No Floating Point Exceptions have been reported
-     2,278,493,650      cycles                           #    2.960 GHz                    
-     3,260,222,958      instructions                     #    1.43  insn per cycle         
-       0.826868506 seconds time elapsed
+     2,265,443,315      cycles                           #    2.945 GHz                    
+     3,237,723,769      instructions                     #    1.43  insn per cycle         
+       0.826628143 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.088519e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.111448e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.111448e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.082497e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.105654e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.105654e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     1.526994 sec
+TOTAL       :     1.532875 sec
 INFO: No Floating Point Exceptions have been reported
-     4,638,191,750      cycles                           #    3.030 GHz                    
-    13,173,310,720      instructions                     #    2.84  insn per cycle         
-       1.535040418 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  679) (avx2:    0) (512y:    0) (512z:    0)
+     4,647,233,937      cycles                           #    3.025 GHz                    
+    13,168,093,251      instructions                     #    2.83  insn per cycle         
+       1.537009895 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  666) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.926197e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.997457e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.997457e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.916408e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.986697e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.986697e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.871595 sec
+TOTAL       :     0.873749 sec
 INFO: No Floating Point Exceptions have been reported
-     2,645,790,535      cycles                           #    3.021 GHz                    
-     7,484,268,569      instructions                     #    2.83  insn per cycle         
-       0.879191644 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3153) (avx2:    0) (512y:    0) (512z:    0)
+     2,638,584,974      cycles                           #    3.010 GHz                    
+     7,477,829,189      instructions                     #    2.83  insn per cycle         
+       0.877352084 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3141) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.372139e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.594706e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.594706e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.313421e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.533027e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.533027e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.507773 sec
+TOTAL       :     0.513511 sec
 INFO: No Floating Point Exceptions have been reported
-     1,475,182,947      cycles                           #    2.884 GHz                    
-     3,135,352,683      instructions                     #    2.13  insn per cycle         
-       0.514367920 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3115) (512y:    0) (512z:    0)
+     1,473,425,351      cycles                           #    2.852 GHz                    
+     3,129,237,400      instructions                     #    2.12  insn per cycle         
+       0.517237290 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3097) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.762089e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.038043e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.038043e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.703540e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.984962e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.984962e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.457004 sec
+TOTAL       :     0.461287 sec
 INFO: No Floating Point Exceptions have been reported
-     1,327,429,252      cycles                           #    2.880 GHz                    
-     2,989,142,789      instructions                     #    2.25  insn per cycle         
-       0.463756238 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2881) (512y:  110) (512z:    0)
+     1,320,825,681      cycles                           #    2.850 GHz                    
+     2,983,955,617      instructions                     #    2.26  insn per cycle         
+       0.465038534 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2857) (512y:  110) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0]
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.413330e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.527512e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.527512e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.367399e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.477116e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.477116e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.018083e+01 +- 1.429474e+01 )  GeV^-2
-TOTAL       :     0.702513 sec
+TOTAL       :     0.713600 sec
 INFO: No Floating Point Exceptions have been reported
-     1,371,536,812      cycles                           #    1.941 GHz                    
-     1,997,365,728      instructions                     #    1.46  insn per cycle         
-       0.709597826 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1713) (512y:  114) (512z: 2251)
+     1,364,189,990      cycles                           #    1.903 GHz                    
+     1,991,688,961      instructions                     #    1.46  insn per cycle         
+       0.717422383 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1632) (512y:  108) (512z: 2251)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt
index f32a724279..83b828ef2e 100644
--- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 
-DATE: 2024-06-28_21:12:11
+DATE: 2024-08-08_20:39:39
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.891269e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.097165e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.182586e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.966123e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.101302e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.184882e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     0.518623 sec
+TOTAL       :     0.517997 sec
 INFO: No Floating Point Exceptions have been reported
-     2,223,632,508      cycles                           #    2.966 GHz                    
-     3,217,766,625      instructions                     #    1.45  insn per cycle         
-       0.806241893 seconds time elapsed
+     2,197,627,386      cycles                           #    2.931 GHz                    
+     3,156,596,662      instructions                     #    1.44  insn per cycle         
+       0.806377685 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 226
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.688566e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.727462e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.727462e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.676906e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.715525e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.715525e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     6.319846 sec
+TOTAL       :     6.391723 sec
 INFO: No Floating Point Exceptions have been reported
-    19,281,617,903      cycles                           #    3.049 GHz                    
-    51,950,129,769      instructions                     #    2.69  insn per cycle         
-       6.325274001 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  681) (avx2:    0) (512y:    0) (512z:    0)
+    19,396,886,248      cycles                           #    3.031 GHz                    
+    52,050,532,705      instructions                     #    2.68  insn per cycle         
+       6.400835825 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  668) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.003345e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.138677e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.138677e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.012360e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.148434e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.148434e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     3.601344 sec
+TOTAL       :     3.619594 sec
 INFO: No Floating Point Exceptions have been reported
-    10,942,559,787      cycles                           #    3.035 GHz                    
-    30,789,312,911      instructions                     #    2.81  insn per cycle         
-       3.606762368 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2929) (avx2:    0) (512y:    0) (512z:    0)
+    11,008,104,240      cycles                           #    3.034 GHz                    
+    30,899,851,824      instructions                     #    2.81  insn per cycle         
+       3.628709587 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2914) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.784075e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.119038e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.119038e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.811277e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.159957e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.159957e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     2.300582 sec
+TOTAL       :     2.317730 sec
 INFO: No Floating Point Exceptions have been reported
-     6,469,586,871      cycles                           #    2.807 GHz                    
-    13,668,839,399      instructions                     #    2.11  insn per cycle         
-       2.306000495 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2952) (512y:    0) (512z:    0)
+     6,603,833,232      cycles                           #    2.839 GHz                    
+    13,785,660,246      instructions                     #    2.09  insn per cycle         
+       2.326886320 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2934) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.242513e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.669269e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.669269e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.274677e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.701182e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.701182e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     2.113701 sec
+TOTAL       :     2.128100 sec
 INFO: No Floating Point Exceptions have been reported
-     5,923,179,599      cycles                           #    2.796 GHz                    
-    13,009,174,359      instructions                     #    2.20  insn per cycle         
-       2.119346982 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2684) (512y:  146) (512z:    0)
+     6,037,170,556      cycles                           #    2.826 GHz                    
+    13,124,188,246      instructions                     #    2.17  insn per cycle         
+       2.137191260 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2660) (512y:  146) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.615364e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.807396e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.807396e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.546906e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.734269e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.734269e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     3.009511 sec
+TOTAL       :     3.095180 sec
 INFO: No Floating Point Exceptions have been reported
-     5,858,071,251      cycles                           #    1.944 GHz                    
-     8,592,010,560      instructions                     #    1.47  insn per cycle         
-       3.014951128 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1518) (512y:  128) (512z: 1942)
+     5,952,641,894      cycles                           #    1.919 GHz                    
+     8,707,382,958      instructions                     #    1.46  insn per cycle         
+       3.104614357 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1494) (512y:  128) (512z: 1942)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt
index 73fd47669c..6dfb3d97d4 100644
--- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 
-DATE: 2024-06-28_21:12:36
+DATE: 2024-08-08_20:40:05
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.867031e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.100662e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.185332e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.936743e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.101495e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.185931e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     0.520675 sec
+TOTAL       :     0.520732 sec
 INFO: No Floating Point Exceptions have been reported
-     2,224,693,306      cycles                           #    2.963 GHz                    
-     3,176,876,470      instructions                     #    1.43  insn per cycle         
-       0.809131316 seconds time elapsed
+     2,199,613,002      cycles                           #    2.925 GHz                    
+     3,199,605,848      instructions                     #    1.45  insn per cycle         
+       0.808356541 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.773905e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.816144e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.816144e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.741086e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.782692e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.782692e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     6.020472 sec
+TOTAL       :     6.159994 sec
 INFO: No Floating Point Exceptions have been reported
-    18,445,965,429      cycles                           #    3.062 GHz                    
-    50,082,657,641      instructions                     #    2.72  insn per cycle         
-       6.025879163 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  639) (avx2:    0) (512y:    0) (512z:    0)
+    18,606,289,146      cycles                           #    3.016 GHz                    
+    50,188,372,015      instructions                     #    2.70  insn per cycle         
+       6.169438178 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  626) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.193122e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.346686e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.346686e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.098336e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.247173e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.247173e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     3.392949 sec
+TOTAL       :     3.523816 sec
 INFO: No Floating Point Exceptions have been reported
-    10,374,405,143      cycles                           #    3.054 GHz                    
-    29,167,609,202      instructions                     #    2.81  insn per cycle         
-       3.398383511 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2747) (avx2:    0) (512y:    0) (512z:    0)
+    10,442,361,179      cycles                           #    2.956 GHz                    
+    29,279,251,351      instructions                     #    2.80  insn per cycle         
+       3.532990329 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2732) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.537528e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.839673e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.839673e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.443138e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.746940e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.746940e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     2.418900 sec
+TOTAL       :     2.497852 sec
 INFO: No Floating Point Exceptions have been reported
-     6,949,645,422      cycles                           #    2.868 GHz                    
-    15,150,928,033      instructions                     #    2.18  insn per cycle         
-       2.424241351 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3032) (512y:    0) (512z:    0)
+     7,066,085,833      cycles                           #    2.820 GHz                    
+    15,266,746,500      instructions                     #    2.16  insn per cycle         
+       2.506843234 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3014) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.710563e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.035488e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.035488e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.619490e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.939857e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.939857e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     2.334255 sec
+TOTAL       :     2.408665 sec
 INFO: No Floating Point Exceptions have been reported
-     6,712,488,345      cycles                           #    2.870 GHz                    
-    14,622,404,295      instructions                     #    2.18  insn per cycle         
-       2.339601480 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2634) (512y:  302) (512z:    0)
+     6,801,023,817      cycles                           #    2.814 GHz                    
+    14,741,025,083      instructions                     #    2.17  insn per cycle         
+       2.418105582 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2610) (512y:  302) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.519771e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.696832e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.696832e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.467108e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.646231e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.646231e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     3.089786 sec
+TOTAL       :     3.162174 sec
 INFO: No Floating Point Exceptions have been reported
-     6,048,674,512      cycles                           #    1.955 GHz                    
-    10,341,337,272      instructions                     #    1.71  insn per cycle         
-       3.095220098 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1280) (512y:  214) (512z: 2129)
+     6,163,693,414      cycles                           #    1.944 GHz                    
+    10,458,436,313      instructions                     #    1.70  insn per cycle         
+       3.171538437 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1256) (512y:  214) (512z: 2129)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt
index 2f4b520747..f2fae03e6f 100644
--- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 
-DATE: 2024-06-28_21:13:02
+DATE: 2024-08-08_20:40:31
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 9.117967e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.014428e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.165976e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.265904e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.014084e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.164702e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 7.154219e+00 +- 1.620281e-01 )  GeV^0
-TOTAL       :     0.475599 sec
+TOTAL       :     0.479298 sec
 INFO: No Floating Point Exceptions have been reported
-     2,098,451,460      cycles                           #    2.983 GHz                    
-     2,991,055,796      instructions                     #    1.43  insn per cycle         
-       0.760185973 seconds time elapsed
+     2,081,740,099      cycles                           #    2.923 GHz                    
+     2,980,788,530      instructions                     #    1.43  insn per cycle         
+       0.769444492 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 157
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.731627e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.773602e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.773602e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.729175e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.771417e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.771417e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.175644e+00 +- 1.658767e-01 )  GeV^0
-TOTAL       :     6.141422 sec
+TOTAL       :     6.156936 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-    18,599,220,732      cycles                           #    3.027 GHz                    
-    51,234,674,126      instructions                     #    2.75  insn per cycle         
-       6.146514341 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  638) (avx2:    0) (512y:    0) (512z:    0)
+    18,595,330,502      cycles                           #    3.018 GHz                    
+    51,251,959,778      instructions                     #    2.76  insn per cycle         
+       6.163337596 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  625) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -110,20 +113,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.173906e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.454050e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.454050e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.099341e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.368380e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.368380e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.175642e+00 +- 1.658767e-01 )  GeV^0
-TOTAL       :     2.599505 sec
+TOTAL       :     2.652061 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-     7,952,125,495      cycles                           #    3.054 GHz                    
-    19,320,215,609      instructions                     #    2.43  insn per cycle         
-       2.604647260 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3555) (avx2:    0) (512y:    0) (512z:    0)
+     7,973,155,362      cycles                           #    3.000 GHz                    
+    19,354,832,142      instructions                     #    2.43  insn per cycle         
+       2.658432650 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3543) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -140,20 +144,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.227356e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.294161e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.294161e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.856741e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.854878e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.854878e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.198861e+00 +- 1.710281e-01 )  GeV^0
-TOTAL       :     1.359948 sec
+TOTAL       :     1.428829 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-     3,954,758,474      cycles                           #    2.898 GHz                    
-     8,835,602,425      instructions                     #    2.23  insn per cycle         
-       1.365174055 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3719) (512y:    0) (512z:    0)
+     4,050,150,212      cycles                           #    2.823 GHz                    
+     8,874,617,638      instructions                     #    2.19  insn per cycle         
+       1.435345706 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3701) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -168,20 +173,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.666240e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.854739e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.854739e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.579308e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.783002e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.783002e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.198861e+00 +- 1.710281e-01 )  GeV^0
-TOTAL       :     1.294891 sec
+TOTAL       :     1.316483 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-     3,727,185,027      cycles                           #    2.869 GHz                    
-     8,438,200,999      instructions                     #    2.26  insn per cycle         
-       1.300078595 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3555) (512y:   20) (512z:    0)
+     3,770,202,308      cycles                           #    2.852 GHz                    
+     8,473,429,912      instructions                     #    2.25  insn per cycle         
+       1.322971561 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3531) (512y:   20) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -196,20 +202,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 6.299305e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.889301e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.889301e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.340113e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.941423e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.941423e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.198861e+00 +- 1.710281e-01 )  GeV^0
-TOTAL       :     1.751707 sec
+TOTAL       :     1.746808 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-     3,532,326,839      cycles                           #    2.012 GHz                    
-     6,249,861,118      instructions                     #    1.77  insn per cycle         
-       1.756948421 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2453) (512y:   32) (512z: 2288)
+     3,535,492,788      cycles                           #    2.017 GHz                    
+     6,276,858,891      instructions                     #    1.78  insn per cycle         
+       1.753255052 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2373) (512y:   24) (512z: 2288)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt
index 0a5421b6bf..0a0273143f 100644
--- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 
-DATE: 2024-06-28_21:13:23
+DATE: 2024-08-08_20:40:52
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 9.138769e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.031502e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.196830e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.367628e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.048579e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.197733e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 7.154219e+00 +- 1.620281e-01 )  GeV^0
-TOTAL       :     0.482359 sec
+TOTAL       :     0.477604 sec
 INFO: No Floating Point Exceptions have been reported
-     2,078,305,223      cycles                           #    2.945 GHz                    
-     2,939,956,772      instructions                     #    1.41  insn per cycle         
-       0.764061380 seconds time elapsed
+     2,076,219,464      cycles                           #    2.927 GHz                    
+     2,975,745,460      instructions                     #    1.43  insn per cycle         
+       0.766187526 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 131
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.770832e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.814865e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.814865e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.736285e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.779068e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.779068e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.175644e+00 +- 1.658767e-01 )  GeV^0
-TOTAL       :     6.006723 sec
+TOTAL       :     6.132525 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-    18,001,534,135      cycles                           #    2.995 GHz                    
-    49,621,174,694      instructions                     #    2.76  insn per cycle         
-       6.011974065 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  626) (avx2:    0) (512y:    0) (512z:    0)
+    18,052,449,940      cycles                           #    2.941 GHz                    
+    49,636,091,735      instructions                     #    2.75  insn per cycle         
+       6.138910377 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  613) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -110,20 +113,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.694395e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.041943e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.041943e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.614737e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.962775e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.962775e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.175642e+00 +- 1.658767e-01 )  GeV^0
-TOTAL       :     2.321218 sec
+TOTAL       :     2.366728 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-     7,097,809,138      cycles                           #    3.052 GHz                    
-    18,485,382,753      instructions                     #    2.60  insn per cycle         
-       2.326522398 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3247) (avx2:    0) (512y:    0) (512z:    0)
+     7,117,859,932      cycles                           #    3.001 GHz                    
+    18,522,428,859      instructions                     #    2.60  insn per cycle         
+       2.373189090 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3235) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -140,20 +144,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.589453e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.061103e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.061103e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.520738e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.991057e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.991057e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.198861e+00 +- 1.710281e-01 )  GeV^0
-TOTAL       :     1.962194 sec
+TOTAL       :     1.992175 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-     5,644,401,281      cycles                           #    2.870 GHz                    
-    10,850,847,216      instructions                     #    1.92  insn per cycle         
-       1.967315204 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4278) (512y:    0) (512z:    0)
+     5,687,734,724      cycles                           #    2.847 GHz                    
+    10,882,767,796      instructions                     #    1.91  insn per cycle         
+       1.998751657 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4260) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -170,20 +175,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.669808e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.159078e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.159078e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.605855e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.093953e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.093953e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.198861e+00 +- 1.710281e-01 )  GeV^0
-TOTAL       :     1.936979 sec
+TOTAL       :     1.963543 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-     5,562,205,138      cycles                           #    2.865 GHz                    
-    10,548,817,439      instructions                     #    1.90  insn per cycle         
-       1.942344639 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4147) (512y:   12) (512z:    0)
+     5,605,481,105      cycles                           #    2.846 GHz                    
+    10,580,081,810      instructions                     #    1.89  insn per cycle         
+       1.969981859 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 4123) (512y:   12) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -200,20 +206,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.667808e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.981925e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.981925e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.560324e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.865892e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.865892e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.198861e+00 +- 1.710281e-01 )  GeV^0
-TOTAL       :     2.332449 sec
+TOTAL       :     2.392840 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-     4,630,982,511      cycles                           #    1.982 GHz                    
-     8,663,797,017      instructions                     #    1.87  insn per cycle         
-       2.337797442 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2929) (512y:    8) (512z: 2883)
+     4,694,796,569      cycles                           #    1.957 GHz                    
+     8,695,099,464      instructions                     #    1.85  insn per cycle         
+       2.399389128 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2849) (512y:    0) (512z: 2883)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt
index 4f10d746fb..62d3c322fa 100644
--- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 
-DATE: 2024-06-28_21:13:46
+DATE: 2024-08-08_20:41:15
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.884315e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.101968e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.186528e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.961744e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.101148e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.184921e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     0.518718 sec
+TOTAL       :     0.519363 sec
 INFO: No Floating Point Exceptions have been reported
-     2,224,545,388      cycles                           #    2.964 GHz                    
-     3,211,728,542      instructions                     #    1.44  insn per cycle         
-       0.807600897 seconds time elapsed
+     2,191,794,568      cycles                           #    2.919 GHz                    
+     3,157,238,703      instructions                     #    1.44  insn per cycle         
+       0.807852407 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 226
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.579495e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.613967e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.613967e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.547380e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.581051e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.581051e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     6.750243 sec
+TOTAL       :     6.917943 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-    20,483,878,120      cycles                           #    3.033 GHz                    
-    51,951,197,139      instructions                     #    2.54  insn per cycle         
-       6.755548112 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  668) (avx2:    0) (512y:    0) (512z:    0)
+    20,590,059,617      cycles                           #    2.973 GHz                    
+    52,050,938,989      instructions                     #    2.53  insn per cycle         
+       6.927193752 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  655) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -110,20 +113,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.806286e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.923526e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.923526e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.762310e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.879212e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.879212e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     3.845987 sec
+TOTAL       :     3.935303 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-    11,542,409,923      cycles                           #    2.998 GHz                    
-    30,594,439,566      instructions                     #    2.65  insn per cycle         
-       3.851470148 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2982) (avx2:    0) (512y:    0) (512z:    0)
+    11,659,111,162      cycles                           #    2.956 GHz                    
+    30,715,351,599      instructions                     #    2.63  insn per cycle         
+       3.944612578 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2970) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -140,20 +144,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.722824e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.056906e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.056906e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.631108e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.954751e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.954751e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     2.331102 sec
+TOTAL       :     2.401648 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-     6,688,713,601      cycles                           #    2.864 GHz                    
-    13,612,289,001      instructions                     #    2.04  insn per cycle         
-       2.336516500 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3124) (512y:    0) (512z:    0)
+     6,824,462,536      cycles                           #    2.832 GHz                    
+    13,725,309,322      instructions                     #    2.01  insn per cycle         
+       2.410817230 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3106) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -170,20 +175,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.154620e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.545538e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.545538e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.105035e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.496184e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.496184e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     2.192487 sec
+TOTAL       :     2.189054 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-     6,160,306,610      cycles                           #    2.843 GHz                    
-    12,983,189,743      instructions                     #    2.11  insn per cycle         
-       2.199017366 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2863) (512y:  150) (512z:    0)
+     6,256,988,161      cycles                           #    2.848 GHz                    
+    13,091,196,075      instructions                     #    2.09  insn per cycle         
+       2.197929864 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2839) (512y:  150) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -200,20 +206,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.308189e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.463723e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.463723e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.274756e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.429596e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.429596e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     3.278659 sec
+TOTAL       :     3.340001 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-     6,408,069,755      cycles                           #    1.952 GHz                    
-     8,704,551,378      instructions                     #    1.36  insn per cycle         
-       3.284062400 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1815) (512y:  134) (512z: 2012)
+     6,530,704,290      cycles                           #    1.951 GHz                    
+     8,820,931,604      instructions                     #    1.35  insn per cycle         
+       3.348983212 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1769) (512y:  130) (512z: 2012)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt
index 0521944df3..8f692fc05c 100644
--- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 
-DATE: 2024-06-28_21:14:12
+DATE: 2024-08-08_20:41:42
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.848166e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.099691e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.185849e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.985439e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.104211e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.186889e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     0.523260 sec
+TOTAL       :     0.520398 sec
 INFO: No Floating Point Exceptions have been reported
-     2,233,257,640      cycles                           #    2.961 GHz                    
-     3,232,811,415      instructions                     #    1.45  insn per cycle         
-       0.813371271 seconds time elapsed
+     2,215,259,816      cycles                           #    2.943 GHz                    
+     3,181,112,910      instructions                     #    1.44  insn per cycle         
+       0.810106845 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.667950e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.705744e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.705744e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.642914e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.679857e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.679857e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     6.397356 sec
+TOTAL       :     6.520897 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-    19,513,033,741      cycles                           #    3.048 GHz                    
-    49,980,418,654      instructions                     #    2.56  insn per cycle         
-       6.402937387 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  612) (avx2:    0) (512y:    0) (512z:    0)
+    19,742,813,002      cycles                           #    3.024 GHz                    
+    50,090,585,504      instructions                     #    2.54  insn per cycle         
+       6.530114912 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  599) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -110,20 +113,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.929554e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.057191e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.057191e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.996801e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.132711e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.132711e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     3.687798 sec
+TOTAL       :     3.635789 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-    10,975,545,528      cycles                           #    2.972 GHz                    
-    29,099,576,131      instructions                     #    2.65  insn per cycle         
-       3.693239239 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2818) (avx2:    0) (512y:    0) (512z:    0)
+    11,015,177,767      cycles                           #    3.023 GHz                    
+    29,218,453,275      instructions                     #    2.65  insn per cycle         
+       3.644811061 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2806) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -140,20 +144,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.763591e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.971776e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.971776e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.818882e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.034730e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.034730e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     2.898131 sec
+TOTAL       :     2.883629 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-     8,065,988,902      cycles                           #    2.779 GHz                    
-    15,176,934,150      instructions                     #    1.88  insn per cycle         
-       2.903385335 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3208) (512y:    0) (512z:    0)
+     8,167,532,623      cycles                           #    2.824 GHz                    
+    15,289,290,626      instructions                     #    1.87  insn per cycle         
+       2.892785978 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3190) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -170,20 +175,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.100432e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.344362e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.344362e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.019354e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.261718e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.261718e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     2.667556 sec
+TOTAL       :     2.748891 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-     7,691,086,412      cycles                           #    2.878 GHz                    
-    14,485,100,498      instructions                     #    1.88  insn per cycle         
-       2.672916815 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2786) (512y:  304) (512z:    0)
+     7,796,139,330      cycles                           #    2.827 GHz                    
+    14,598,894,712      instructions                     #    1.87  insn per cycle         
+       2.758146376 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2762) (512y:  304) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -200,20 +206,21 @@ Process                     = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.246284e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.395334e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.395334e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.130478e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.273768e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.273768e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 7.148017e+00 +- 1.609110e-01 )  GeV^0
-TOTAL       :     3.338576 sec
+TOTAL       :     3.488340 sec
 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-     6,541,048,191      cycles                           #    1.957 GHz                    
-     9,899,451,402      instructions                     #    1.51  insn per cycle         
-       3.343805399 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1588) (512y:  220) (512z: 2216)
+     6,648,747,235      cycles                           #    1.902 GHz                    
+    10,013,894,735      instructions                     #    1.51  insn per cycle         
+       3.497416797 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1542) (512y:  216) (512z: 2216)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt
index 90c6b8c61e..ad80cd52ba 100644
--- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 
-DATE: 2024-06-28_21:11:10
+DATE: 2024-08-08_20:38:36
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.193723e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.215926e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.219404e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.191569e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.214197e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.217917e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.461231 sec
+TOTAL       :     0.458797 sec
 INFO: No Floating Point Exceptions have been reported
-     2,031,756,874      cycles                           #    2.969 GHz                    
-     2,962,940,364      instructions                     #    1.46  insn per cycle         
-       0.741692237 seconds time elapsed
+     1,983,013,526      cycles                           #    2.927 GHz                    
+     2,898,600,678      instructions                     #    1.46  insn per cycle         
+       0.735167670 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 7.824383e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.989844e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.000074e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.853741e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.992878e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.001850e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 8.048215e-03 +- 4.042405e-03 )  GeV^-4
-TOTAL       :     0.477201 sec
+TOTAL       :     0.478795 sec
 INFO: No Floating Point Exceptions have been reported
-     2,063,797,965      cycles                           #    2.957 GHz                    
-     3,051,978,094      instructions                     #    1.48  insn per cycle         
-       0.754833361 seconds time elapsed
+     2,032,935,359      cycles                           #    2.895 GHz                    
+     3,002,750,539      instructions                     #    1.48  insn per cycle         
+       0.759651454 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.546074e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.549505e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.549505e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.535539e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.539012e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.539012e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.154149 sec
+TOTAL       :     0.151546 sec
 INFO: No Floating Point Exceptions have been reported
-       476,458,242      cycles                           #    3.028 GHz                    
-     1,396,962,991      instructions                     #    2.93  insn per cycle         
-       0.158033335 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3921) (avx2:    0) (512y:    0) (512z:    0)
+       468,124,472      cycles                           #    3.026 GHz                    
+     1,389,955,355      instructions                     #    2.97  insn per cycle         
+       0.155210727 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3908) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 6.769944e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.782647e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.782647e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.637495e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.649053e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.649053e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.082624 sec
+TOTAL       :     0.081392 sec
 INFO: No Floating Point Exceptions have been reported
-       247,067,561      cycles                           #    2.873 GHz                    
-       700,108,508      instructions                     #    2.83  insn per cycle         
-       0.086505657 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 9495) (avx2:    0) (512y:    0) (512z:    0)
+       240,371,597      cycles                           #    2.843 GHz                    
+       693,129,674      instructions                     #    2.88  insn per cycle         
+       0.085091876 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 9483) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.495648e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.501499e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.501499e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.470591e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.476735e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.476735e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.040068 sec
+TOTAL       :     0.038239 sec
 INFO: No Floating Point Exceptions have been reported
-       121,183,823      cycles                           #    2.800 GHz                    
-       265,080,473      instructions                     #    2.19  insn per cycle         
-       0.043900082 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 8514) (512y:    0) (512z:    0)
+       114,892,967      cycles                           #    2.759 GHz                    
+       258,045,984      instructions                     #    2.25  insn per cycle         
+       0.042251807 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 8496) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.687162e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.695060e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.695060e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.699002e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.707705e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.707705e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.036254 sec
+TOTAL       :     0.033054 sec
 INFO: No Floating Point Exceptions have been reported
-       109,659,099      cycles                           #    2.745 GHz                    
-       247,206,412      instructions                     #    2.25  insn per cycle         
-       0.040604680 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 8157) (512y:  150) (512z:    0)
+       102,370,235      cycles                           #    2.829 GHz                    
+       240,205,792      instructions                     #    2.35  insn per cycle         
+       0.036714327 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 8133) (512y:  150) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.223877e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.228733e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.228733e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.284659e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.290558e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.290558e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.048129 sec
+TOTAL       :     0.043329 sec
 INFO: No Floating Point Exceptions have been reported
-        97,361,748      cycles                           #    1.896 GHz                    
-       141,545,344      instructions                     #    1.45  insn per cycle         
-       0.051936443 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1955) (512y:  126) (512z: 7089)
+        89,664,319      cycles                           #    1.930 GHz                    
+       134,445,525      instructions                     #    1.50  insn per cycle         
+       0.047102954 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1931) (512y:  126) (512z: 7089)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt
index 13930c5a4b..ce829c6200 100644
--- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 
-DATE: 2024-06-28_21:11:21
+DATE: 2024-08-08_20:38:46
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.238168e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.265600e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.269385e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.249020e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.272842e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.276725e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.464107 sec
+TOTAL       :     0.461905 sec
 INFO: No Floating Point Exceptions have been reported
-     1,996,708,175      cycles                           #    2.888 GHz                    
-     2,944,414,956      instructions                     #    1.47  insn per cycle         
-       0.748472969 seconds time elapsed
+     2,018,577,231      cycles                           #    2.927 GHz                    
+     2,882,435,680      instructions                     #    1.43  insn per cycle         
+       0.748301491 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 7.940461e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.101663e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.112629e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.955136e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.095621e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.108051e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 8.048215e-03 +- 4.042405e-03 )  GeV^-4
-TOTAL       :     0.480375 sec
+TOTAL       :     0.478584 sec
 INFO: No Floating Point Exceptions have been reported
-     2,101,513,196      cycles                           #    2.947 GHz                    
-     3,103,578,303      instructions                     #    1.48  insn per cycle         
-       0.770268851 seconds time elapsed
+     2,069,849,202      cycles                           #    2.946 GHz                    
+     3,022,582,128      instructions                     #    1.46  insn per cycle         
+       0.760103886 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.331366e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.334564e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.334564e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.498608e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.502028e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.502028e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.163520 sec
+TOTAL       :     0.152353 sec
 INFO: No Floating Point Exceptions have been reported
-       475,742,235      cycles                           #    2.848 GHz                    
-     1,392,453,483      instructions                     #    2.93  insn per cycle         
-       0.167648338 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3809) (avx2:    0) (512y:    0) (512z:    0)
+       465,735,866      cycles                           #    2.994 GHz                    
+     1,385,207,858      instructions                     #    2.97  insn per cycle         
+       0.156142730 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3796) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.895678e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.906547e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.906547e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.699480e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.712661e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.712661e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.093681 sec
+TOTAL       :     0.080022 sec
 INFO: No Floating Point Exceptions have been reported
-       247,202,379      cycles                           #    2.547 GHz                    
-       696,396,818      instructions                     #    2.82  insn per cycle         
-       0.097666857 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 9540) (avx2:    0) (512y:    0) (512z:    0)
+       238,839,052      cycles                           #    2.875 GHz                    
+       689,228,820      instructions                     #    2.89  insn per cycle         
+       0.083649102 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 9528) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.477425e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.483400e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.483400e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.515936e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.522249e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.522249e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.039763 sec
+TOTAL       :     0.036065 sec
 INFO: No Floating Point Exceptions have been reported
-       119,037,303      cycles                           #    2.761 GHz                    
-       260,641,164      instructions                     #    2.19  insn per cycle         
-       0.043647510 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 8469) (512y:    0) (512z:    0)
+       111,582,476      cycles                           #    2.848 GHz                    
+       253,551,951      instructions                     #    2.27  insn per cycle         
+       0.039739897 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 8451) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.633386e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.640521e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.640521e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.680034e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.687653e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.687653e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.036416 sec
+TOTAL       :     0.032732 sec
 INFO: No Floating Point Exceptions have been reported
-       107,502,149      cycles                           #    2.701 GHz                    
-       242,803,467      instructions                     #    2.26  insn per cycle         
-       0.040331301 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 8115) (512y:  150) (512z:    0)
+       100,255,842      cycles                           #    2.793 GHz                    
+       235,731,789      instructions                     #    2.35  insn per cycle         
+       0.036414093 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 8091) (512y:  150) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.272818e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.278041e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.278041e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.271489e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.276895e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.276895e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.045889 sec
+TOTAL       :     0.042973 sec
 INFO: No Floating Point Exceptions have been reported
-        94,930,532      cycles                           #    1.931 GHz                    
-       136,876,173      instructions                     #    1.44  insn per cycle         
-       0.049671016 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1911) (512y:  126) (512z: 7093)
+        87,728,536      cycles                           #    1.900 GHz                    
+       129,884,935      instructions                     #    1.48  insn per cycle         
+       0.046739732 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1887) (512y:  126) (512z: 7093)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt
index e0238fd00f..3f66e78e98 100644
--- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 
-DATE: 2024-06-28_21:11:31
+DATE: 2024-08-08_20:38:57
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 2.451775e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.461985e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.464542e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.450134e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.460503e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.463108e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.188141e-04 +- 6.565202e-04 )  GeV^-4
-TOTAL       :     0.464407 sec
+TOTAL       :     0.461786 sec
 INFO: No Floating Point Exceptions have been reported
-     2,018,377,870      cycles                           #    2.962 GHz                    
-     2,930,359,221      instructions                     #    1.45  insn per cycle         
-       0.738289299 seconds time elapsed
+     1,983,576,716      cycles                           #    2.936 GHz                    
+     2,917,710,082      instructions                     #    1.47  insn per cycle         
+       0.732112148 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 9.084747e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.210078e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.222065e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.144453e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.248650e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.259538e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 8.020494e-03 +- 4.025605e-03 )  GeV^-4
-TOTAL       :     0.466023 sec
+TOTAL       :     0.468413 sec
 INFO: No Floating Point Exceptions have been reported
-     2,038,104,278      cycles                           #    2.956 GHz                    
-     2,947,994,249      instructions                     #    1.45  insn per cycle         
-       0.746657233 seconds time elapsed
+     2,017,794,611      cycles                           #    2.933 GHz                    
+     2,930,677,889      instructions                     #    1.45  insn per cycle         
+       0.746841147 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.615276e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.618884e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.618884e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.555756e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.559328e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.559328e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 7.177153e-04 +- 6.554185e-04 )  GeV^-4
-TOTAL       :     0.151078 sec
+TOTAL       :     0.150880 sec
 INFO: No Floating Point Exceptions have been reported
-       471,303,235      cycles                           #    3.055 GHz                    
-     1,389,260,982      instructions                     #    2.95  insn per cycle         
-       0.154884128 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3071) (avx2:    0) (512y:    0) (512z:    0)
+       463,646,900      cycles                           #    3.010 GHz                    
+     1,382,054,083      instructions                     #    2.98  insn per cycle         
+       0.154571759 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3058) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.276596e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.281299e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.281299e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.231675e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.235936e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.235936e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.177152e-04 +- 6.554185e-04 )  GeV^-4
-TOTAL       :     0.046101 sec
+TOTAL       :     0.044706 sec
 INFO: No Floating Point Exceptions have been reported
-       140,089,332      cycles                           #    2.837 GHz                    
-       379,194,848      instructions                     #    2.71  insn per cycle         
-       0.050025437 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:10152) (avx2:    0) (512y:    0) (512z:    0)
+       132,862,579      cycles                           #    2.773 GHz                    
+       372,176,524      instructions                     #    2.80  insn per cycle         
+       0.048442327 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:10140) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.878046e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.901759e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.901759e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.891678e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.915961e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.915961e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.165746e-04 +- 6.542823e-04 )  GeV^-4
-TOTAL       :     0.023061 sec
+TOTAL       :     0.020296 sec
 INFO: No Floating Point Exceptions have been reported
-        72,082,912      cycles                           #    2.736 GHz                    
-       149,966,673      instructions                     #    2.08  insn per cycle         
-       0.026964267 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 9255) (512y:    0) (512z:    0)
+        65,005,087      cycles                           #    2.776 GHz                    
+       142,918,773      instructions                     #    2.20  insn per cycle         
+       0.023971535 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 9237) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.253397e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.283698e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.283698e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.201047e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.231393e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.231393e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.165746e-04 +- 6.542823e-04 )  GeV^-4
-TOTAL       :     0.020929 sec
+TOTAL       :     0.018450 sec
 INFO: No Floating Point Exceptions have been reported
-        66,702,269      cycles                           #    2.763 GHz                    
-       139,955,505      instructions                     #    2.10  insn per cycle         
-       0.024643778 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 8975) (512y:   28) (512z:    0)
+        59,790,078      cycles                           #    2.765 GHz                    
+       132,888,839      instructions                     #    2.22  insn per cycle         
+       0.022153075 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 8951) (512y:   28) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.540853e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.563913e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.563913e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.264475e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.284066e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.284066e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.165747e-04 +- 6.542824e-04 )  GeV^-4
-TOTAL       :     0.025752 sec
+TOTAL       :     0.025826 sec
 INFO: No Floating Point Exceptions have been reported
-        59,485,252      cycles                           #    2.054 GHz                    
-        86,826,640      instructions                     #    1.46  insn per cycle         
-       0.029598069 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2837) (512y:   32) (512z: 7440)
+        53,398,285      cycles                           #    1.814 GHz                    
+        80,038,410      instructions                     #    1.50  insn per cycle         
+       0.029948894 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2813) (512y:   32) (512z: 7440)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt
index 82cc1b864e..c0ec66c0e5 100644
--- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 
-DATE: 2024-06-28_21:11:41
+DATE: 2024-08-08_20:39:07
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 2.484339e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.495870e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.498733e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.475468e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.488915e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.493523e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.188141e-04 +- 6.565202e-04 )  GeV^-4
-TOTAL       :     0.466683 sec
+TOTAL       :     0.466666 sec
 INFO: No Floating Point Exceptions have been reported
-     2,040,356,614      cycles                           #    2.967 GHz                    
-     2,942,177,277      instructions                     #    1.44  insn per cycle         
-       0.745417107 seconds time elapsed
+     2,035,784,320      cycles                           #    2.932 GHz                    
+     2,916,651,120      instructions                     #    1.43  insn per cycle         
+       0.752059618 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 9.335689e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.433529e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.444866e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.233883e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.341900e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.353294e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 8.020496e-03 +- 4.025606e-03 )  GeV^-4
-TOTAL       :     0.465336 sec
+TOTAL       :     0.467271 sec
 INFO: No Floating Point Exceptions have been reported
-     2,039,216,026      cycles                           #    2.964 GHz                    
-     2,973,292,818      instructions                     #    1.46  insn per cycle         
-       0.744817103 seconds time elapsed
+     2,037,159,179      cycles                           #    2.946 GHz                    
+     2,882,523,885      instructions                     #    1.41  insn per cycle         
+       0.747816184 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.582023e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.585808e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.585808e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.551604e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.554949e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.554949e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 7.177153e-04 +- 6.554185e-04 )  GeV^-4
-TOTAL       :     0.152028 sec
+TOTAL       :     0.149984 sec
 INFO: No Floating Point Exceptions have been reported
-       468,837,171      cycles                           #    3.022 GHz                    
-     1,384,022,139      instructions                     #    2.95  insn per cycle         
-       0.155930676 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2943) (avx2:    0) (512y:    0) (512z:    0)
+       461,532,447      cycles                           #    3.013 GHz                    
+     1,376,849,888      instructions                     #    2.98  insn per cycle         
+       0.153697004 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2930) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.265822e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.270876e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.270876e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.248118e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.252450e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.252450e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.177152e-04 +- 6.554185e-04 )  GeV^-4
-TOTAL       :     0.045875 sec
+TOTAL       :     0.043499 sec
 INFO: No Floating Point Exceptions have been reported
-       137,352,487      cycles                           #    2.799 GHz                    
-       374,377,520      instructions                     #    2.73  insn per cycle         
-       0.049677166 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:10135) (avx2:    0) (512y:    0) (512z:    0)
+       130,431,744      cycles                           #    2.801 GHz                    
+       367,402,317      instructions                     #    2.82  insn per cycle         
+       0.047010449 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:10123) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.841707e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.865345e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.865345e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.883527e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.907714e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.907714e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.165746e-04 +- 6.542823e-04 )  GeV^-4
-TOTAL       :     0.022453 sec
+TOTAL       :     0.019514 sec
 INFO: No Floating Point Exceptions have been reported
-        70,160,896      cycles                           #    2.725 GHz                    
-       145,241,864      instructions                     #    2.07  insn per cycle         
-       0.026324679 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 9209) (512y:    0) (512z:    0)
+        62,991,896      cycles                           #    2.777 GHz                    
+       138,167,276      instructions                     #    2.19  insn per cycle         
+       0.023246200 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 9191) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.165204e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.204619e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.204619e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.044826e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.071557e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.071557e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.165746e-04 +- 6.542823e-04 )  GeV^-4
-TOTAL       :     0.020717 sec
+TOTAL       :     0.018654 sec
 INFO: No Floating Point Exceptions have been reported
-        64,835,417      cycles                           #    2.707 GHz                    
-       135,143,508      instructions                     #    2.08  insn per cycle         
-       0.024535534 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 8931) (512y:   28) (512z:    0)
+        57,917,940      cycles                           #    2.662 GHz                    
+       128,096,344      instructions                     #    2.21  insn per cycle         
+       0.022204337 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 8907) (512y:   28) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.455631e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.477792e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.477792e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.471457e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.494959e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.494959e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.165747e-04 +- 6.542824e-04 )  GeV^-4
-TOTAL       :     0.025693 sec
+TOTAL       :     0.022784 sec
 INFO: No Floating Point Exceptions have been reported
-        57,629,979      cycles                           #    1.987 GHz                    
-        82,051,182      instructions                     #    1.42  insn per cycle         
-       0.029515961 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2792) (512y:   32) (512z: 7442)
+        50,131,984      cycles                           #    1.927 GHz                    
+        74,930,459      instructions                     #    1.49  insn per cycle         
+       0.026643138 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2768) (512y:   32) (512z: 7442)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt
index ab9c454944..a1cf964e05 100644
--- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 
-DATE: 2024-06-28_21:11:51
+DATE: 2024-08-08_20:39:18
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.175250e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.198055e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.201847e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.170281e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.193514e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.197230e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.461570 sec
+TOTAL       :     0.460249 sec
 INFO: No Floating Point Exceptions have been reported
-     2,036,984,155      cycles                           #    2.947 GHz                    
-     2,961,824,334      instructions                     #    1.45  insn per cycle         
-       0.749627742 seconds time elapsed
+     1,998,727,826      cycles                           #    2.929 GHz                    
+     2,887,597,557      instructions                     #    1.44  insn per cycle         
+       0.739044353 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 7.826361e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.956648e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.966084e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.840436e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.977655e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.986488e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 8.048215e-03 +- 4.042405e-03 )  GeV^-4
-TOTAL       :     0.478628 sec
+TOTAL       :     0.480871 sec
 INFO: No Floating Point Exceptions have been reported
-     2,094,008,815      cycles                           #    2.983 GHz                    
-     3,111,493,141      instructions                     #    1.49  insn per cycle         
-       0.759239035 seconds time elapsed
+     2,091,938,823      cycles                           #    2.936 GHz                    
+     3,079,530,757      instructions                     #    1.47  insn per cycle         
+       0.770600295 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.466352e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.469566e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.469566e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.326264e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.329481e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.329481e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.157420 sec
+TOTAL       :     0.161027 sec
 INFO: No Floating Point Exceptions have been reported
-       479,914,950      cycles                           #    2.988 GHz                    
-     1,405,491,495      instructions                     #    2.93  insn per cycle         
-       0.161161459 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3912) (avx2:    0) (512y:    0) (512z:    0)
+       471,923,848      cycles                           #    2.871 GHz                    
+     1,398,593,986      instructions                     #    2.96  insn per cycle         
+       0.164917375 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3899) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 6.853341e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.865812e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.865812e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.833451e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.846029e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.846029e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.081922 sec
+TOTAL       :     0.079301 sec
 INFO: No Floating Point Exceptions have been reported
-       243,063,658      cycles                           #    2.855 GHz                    
-       695,298,142      instructions                     #    2.86  insn per cycle         
-       0.085797726 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 9339) (avx2:    0) (512y:    0) (512z:    0)
+       236,478,249      cycles                           #    2.865 GHz                    
+       688,183,765      instructions                     #    2.91  insn per cycle         
+       0.083009452 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 9327) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.352946e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.358722e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.358722e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.464519e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.470938e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.470938e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.043797 sec
+TOTAL       :     0.038027 sec
 INFO: No Floating Point Exceptions have been reported
-       121,157,463      cycles                           #    2.599 GHz                    
-       260,506,764      instructions                     #    2.15  insn per cycle         
-       0.047884491 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 8369) (512y:    0) (512z:    0)
+       113,380,965      cycles                           #    2.745 GHz                    
+       253,222,188      instructions                     #    2.23  insn per cycle         
+       0.041829832 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 8351) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.602254e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.609503e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.609503e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.697656e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.705927e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.705927e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.037787 sec
+TOTAL       :     0.033099 sec
 INFO: No Floating Point Exceptions have been reported
-       108,207,768      cycles                           #    2.631 GHz                    
-       240,816,726      instructions                     #    2.23  insn per cycle         
-       0.041615626 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 7513) (512y:  146) (512z:    0)
+       100,842,922      cycles                           #    2.776 GHz                    
+       233,742,979      instructions                     #    2.32  insn per cycle         
+       0.036790218 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 7489) (512y:  146) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.262123e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.267316e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.267316e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.224753e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.229606e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.229606e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.046689 sec
+TOTAL       :     0.045294 sec
 INFO: No Floating Point Exceptions have been reported
-        98,110,559      cycles                           #    1.961 GHz                    
-       140,335,875      instructions                     #    1.43  insn per cycle         
-       0.050565799 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2085) (512y:  122) (512z: 6355)
+        90,903,043      cycles                           #    1.874 GHz                    
+       133,303,472      instructions                     #    1.47  insn per cycle         
+       0.049138947 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2061) (512y:  122) (512z: 6355)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt
index dbb1d046cf..e66260167e 100644
--- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 
-DATE: 2024-06-28_21:12:01
+DATE: 2024-08-08_20:39:28
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 3.223463e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.251769e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.255527e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.209121e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.235715e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.239868e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.462071 sec
+TOTAL       :     0.460488 sec
 INFO: No Floating Point Exceptions have been reported
-     2,027,925,192      cycles                           #    2.945 GHz                    
-     2,938,582,017      instructions                     #    1.45  insn per cycle         
-       0.747329246 seconds time elapsed
+     1,999,748,612      cycles                           #    2.928 GHz                    
+     2,930,247,263      instructions                     #    1.47  insn per cycle         
+       0.740595703 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -67,18 +67,20 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 7.920161e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.062136e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.071425e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.929472e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.072806e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.082157e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 8.048215e-03 +- 4.042405e-03 )  GeV^-4
-TOTAL       :     0.478582 sec
+TOTAL       :     0.482161 sec
 INFO: No Floating Point Exceptions have been reported
-     2,106,891,234      cycles                           #    2.952 GHz                    
-     3,109,718,200      instructions                     #    1.48  insn per cycle         
-       0.770328620 seconds time elapsed
+     2,061,793,455      cycles                           #    2.911 GHz                    
+     3,015,555,211      instructions                     #    1.46  insn per cycle         
+       0.766758571 seconds time elapsed
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -95,20 +97,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.553282e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.556636e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.556636e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.493942e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.497215e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.497215e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.153094 sec
+TOTAL       :     0.152521 sec
 INFO: No Floating Point Exceptions have been reported
-       476,099,272      cycles                           #    3.050 GHz                    
-     1,401,016,604      instructions                     #    2.94  insn per cycle         
-       0.156845379 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3813) (avx2:    0) (512y:    0) (512z:    0)
+       469,652,977      cycles                           #    3.017 GHz                    
+     1,393,890,707      instructions                     #    2.97  insn per cycle         
+       0.156209215 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3800) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -123,20 +126,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 7.000972e+03                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.014214e+03                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.014214e+03                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.875866e+03                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.888668e+03                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.888668e+03                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.079486 sec
+TOTAL       :     0.077991 sec
 INFO: No Floating Point Exceptions have been reported
-       242,214,106      cycles                           #    2.928 GHz                    
-       691,275,602      instructions                     #    2.85  insn per cycle         
-       0.083233014 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 9372) (avx2:    0) (512y:    0) (512z:    0)
+       235,131,903      cycles                           #    2.896 GHz                    
+       684,356,235      instructions                     #    2.91  insn per cycle         
+       0.081716900 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 9360) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -151,20 +155,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.463005e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.469289e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.469289e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.472431e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.478529e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.478529e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.040105 sec
+TOTAL       :     0.037179 sec
 INFO: No Floating Point Exceptions have been reported
-       118,533,316      cycles                           #    2.738 GHz                    
-       255,884,042      instructions                     #    2.16  insn per cycle         
-       0.043890674 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 8322) (512y:    0) (512z:    0)
+       111,325,082      cycles                           #    2.760 GHz                    
+       248,775,647      instructions                     #    2.23  insn per cycle         
+       0.040876097 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 8304) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -179,20 +184,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.626081e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.633847e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.633847e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.697458e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.705090e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.705090e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.036586 sec
+TOTAL       :     0.032417 sec
 INFO: No Floating Point Exceptions have been reported
-       106,383,966      cycles                           #    2.677 GHz                    
-       236,394,538      instructions                     #    2.22  insn per cycle         
-       0.040350677 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 7464) (512y:  146) (512z:    0)
+        98,963,466      cycles                           #    2.782 GHz                    
+       229,303,120      instructions                     #    2.32  insn per cycle         
+       0.036104618 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 7440) (512y:  146) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -207,20 +213,21 @@ Process                     = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.248196e+04                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.253173e+04                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.253173e+04                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.256457e+04                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.261478e+04                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.261478e+04                 )  sec^-1
 MeanMatrixElemValue         = ( 7.185537e-04 +- 6.562553e-04 )  GeV^-4
-TOTAL       :     0.046442 sec
+TOTAL       :     0.043443 sec
 INFO: No Floating Point Exceptions have been reported
-        95,760,022      cycles                           #    1.927 GHz                    
-       135,734,849      instructions                     #    1.42  insn per cycle         
-       0.050265152 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2036) (512y:  122) (512z: 6355)
+        88,868,110      cycles                           #    1.900 GHz                    
+       128,801,312      instructions                     #    1.45  insn per cycle         
+       0.047318950 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2012) (512y:  122) (512z: 6355)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt
index 60a21f7044..ef58048b29 100644
--- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 
-DATE: 2024-07-23_17:01:29
+DATE: 2024-08-08_20:37:25
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 5.420342e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.313282e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.381713e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.665934e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.063349e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.406343e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.522832 sec
+TOTAL       :     0.506392 sec
 INFO: No Floating Point Exceptions have been reported
-     2,278,234,455      cycles                           #    2.959 GHz                    
-     3,205,581,944      instructions                     #    1.41  insn per cycle         
-       0.827280417 seconds time elapsed
+     2,172,824,039      cycles                           #    2.952 GHz                    
+     3,090,027,466      instructions                     #    1.42  insn per cycle         
+       0.793282296 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 132
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -82,15 +82,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME]     (23) = ( 8.921532e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.019329e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.019329e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.134117e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.048218e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.048218e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     1.289962 sec
+TOTAL       :     1.290483 sec
 INFO: No Floating Point Exceptions have been reported
-     3,730,964,441      cycles                           #    2.882 GHz                    
-     9,721,293,781      instructions                     #    2.61  insn per cycle         
-       1.295775979 seconds time elapsed
+     3,847,248,044      cycles                           #    2.962 GHz                    
+     9,842,303,730      instructions                     #    2.56  insn per cycle         
+       1.299592545 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:  338) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest_cpp.exe
@@ -111,15 +111,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 1.609878e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.093510e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.093510e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.531336e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.978158e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.978158e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.762977 sec
+TOTAL       :     0.826770 sec
 INFO: No Floating Point Exceptions have been reported
-     2,324,074,316      cycles                           #    3.026 GHz                    
-     5,927,719,424      instructions                     #    2.55  insn per cycle         
-       0.768610956 seconds time elapsed
+     2,453,692,398      cycles                           #    2.938 GHz                    
+     6,052,098,536      instructions                     #    2.47  insn per cycle         
+       0.835919362 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4: 1376) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest_cpp.exe
@@ -140,15 +140,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 2.298759e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.386451e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.386451e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.266889e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.345995e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.345995e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.566149 sec
+TOTAL       :     0.606570 sec
 INFO: No Floating Point Exceptions have been reported
-     1,665,204,872      cycles                           #    2.915 GHz                    
-     3,310,874,764      instructions                     #    1.99  insn per cycle         
-       0.571828050 seconds time elapsed
+     1,785,899,086      cycles                           #    2.902 GHz                    
+     3,437,083,551      instructions                     #    1.92  insn per cycle         
+       0.616030368 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1492) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest_cpp.exe
@@ -169,15 +169,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 2.374476e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.525659e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.525659e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.357485e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.522198e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.522198e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.552533 sec
+TOTAL       :     0.586533 sec
 INFO: No Floating Point Exceptions have been reported
-     1,626,012,950      cycles                           #    2.916 GHz                    
-     3,280,880,649      instructions                     #    2.02  insn per cycle         
-       0.558264683 seconds time elapsed
+     1,741,529,265      cycles                           #    2.926 GHz                    
+     3,407,397,649      instructions                     #    1.96  insn per cycle         
+       0.595838672 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1368) (512y:   96) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/runTest_cpp.exe
@@ -198,15 +198,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 2.265036e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.279394e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.279394e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.227600e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.220282e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.220282e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.573090 sec
+TOTAL       :     0.613174 sec
 INFO: No Floating Point Exceptions have been reported
-     1,367,296,463      cycles                           #    2.365 GHz                    
-     2,420,374,484      instructions                     #    1.77  insn per cycle         
-       0.578843377 seconds time elapsed
+     1,478,751,325      cycles                           #    2.377 GHz                    
+     2,546,932,482      instructions                     #    1.72  insn per cycle         
+       0.622601431 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:  568) (512y:   60) (512z: 1020)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/runTest_cpp.exe
diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt
index c85a2ac0a9..8c70303d63 100644
--- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 
-DATE: 2024-07-23_17:01:41
+DATE: 2024-08-08_20:37:37
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 5.466951e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.767444e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.772228e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.814897e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.661637e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.796070e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.529266 sec
+TOTAL       :     0.507946 sec
 INFO: No Floating Point Exceptions have been reported
-     2,231,633,624      cycles                           #    2.901 GHz                    
-     3,111,971,882      instructions                     #    1.39  insn per cycle         
-       0.827158513 seconds time elapsed
+     2,214,460,924      cycles                           #    2.958 GHz                    
+     3,109,800,964      instructions                     #    1.40  insn per cycle         
+       0.807528636 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 124
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -82,15 +82,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME]     (23) = ( 9.499434e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.087743e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.087743e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.340535e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.067339e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.067339e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     1.213256 sec
+TOTAL       :     1.264960 sec
 INFO: No Floating Point Exceptions have been reported
-     3,719,151,096      cycles                           #    3.053 GHz                    
-     9,602,056,797      instructions                     #    2.58  insn per cycle         
-       1.218964071 seconds time elapsed
+     3,833,057,387      cycles                           #    3.009 GHz                    
+     9,733,259,839      instructions                     #    2.54  insn per cycle         
+       1.274559461 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:  356) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest_cpp.exe
@@ -111,15 +111,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 1.600652e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.094506e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.094506e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.542135e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.989720e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.989720e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.765607 sec
+TOTAL       :     0.822438 sec
 INFO: No Floating Point Exceptions have been reported
-     2,343,304,789      cycles                           #    3.043 GHz                    
-     5,873,196,914      instructions                     #    2.51  insn per cycle         
-       0.771257250 seconds time elapsed
+     2,444,623,828      cycles                           #    2.942 GHz                    
+     6,004,739,844      instructions                     #    2.46  insn per cycle         
+       0.831745892 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4: 1342) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest_cpp.exe
@@ -140,15 +140,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 2.294689e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.380462e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.380462e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.232544e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.257016e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.257016e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.566032 sec
+TOTAL       :     0.613019 sec
 INFO: No Floating Point Exceptions have been reported
-     1,659,009,853      cycles                           #    2.906 GHz                    
-     3,283,617,543      instructions                     #    1.98  insn per cycle         
-       0.571506758 seconds time elapsed
+     1,777,339,853      cycles                           #    2.859 GHz                    
+     3,416,813,174      instructions                     #    1.92  insn per cycle         
+       0.622385987 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1429) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest_cpp.exe
@@ -169,15 +169,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 2.264254e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.341747e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.341747e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.366185e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.542246e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.542246e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.577127 sec
+TOTAL       :     0.584170 sec
 INFO: No Floating Point Exceptions have been reported
-     1,625,973,657      cycles                           #    2.793 GHz                    
-     3,257,709,469      instructions                     #    2.00  insn per cycle         
-       0.582960679 seconds time elapsed
+     1,729,011,734      cycles                           #    2.917 GHz                    
+     3,386,515,960      instructions                     #    1.96  insn per cycle         
+       0.593372914 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1321) (512y:   96) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/runTest_cpp.exe
@@ -198,15 +198,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 2.108767e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.019417e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.019417e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.212793e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.204561e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.204561e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.610248 sec
+TOTAL       :     0.617575 sec
 INFO: No Floating Point Exceptions have been reported
-     1,368,423,664      cycles                           #    2.224 GHz                    
-     2,405,748,439      instructions                     #    1.76  insn per cycle         
-       0.616101703 seconds time elapsed
+     1,500,885,532      cycles                           #    2.396 GHz                    
+     2,536,856,422      instructions                     #    1.69  insn per cycle         
+       0.627161657 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:  535) (512y:   60) (512z: 1006)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/runTest_cpp.exe
diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt
index 4cba4c8f17..854849f5b9 100644
--- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 
-DATE: 2024-07-23_17:01:53
+DATE: 2024-08-08_20:37:49
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 1.067979e+08                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.299959e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.734201e+09                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.471582e+08                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.082860e+09                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.730798e+09                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486732e-01 +- 3.293572e-05 )  GeV^0
-TOTAL       :     0.481457 sec
+TOTAL       :     0.477544 sec
 INFO: No Floating Point Exceptions have been reported
-     2,093,279,906      cycles                           #    2.954 GHz                    
-     2,954,954,358      instructions                     #    1.41  insn per cycle         
-       0.765733186 seconds time elapsed
+     2,060,886,859      cycles                           #    2.928 GHz                    
+     2,892,344,882      instructions                     #    1.40  insn per cycle         
+       0.762313323 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 100
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -82,15 +82,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME]     (23) = ( 9.379309e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.076034e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.076034e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.384427e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.077691e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.077691e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486735e-01 +- 3.293563e-05 )  GeV^0
-TOTAL       :     1.204698 sec
+TOTAL       :     1.212857 sec
 INFO: No Floating Point Exceptions have been reported
-     3,643,752,944      cycles                           #    3.013 GHz                    
-     9,596,045,630      instructions                     #    2.63  insn per cycle         
-       1.210140981 seconds time elapsed
+     3,671,434,294      cycles                           #    3.013 GHz                    
+     9,632,126,320      instructions                     #    2.62  insn per cycle         
+       1.219246655 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:  462) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest_cpp.exe
@@ -111,15 +111,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 2.299837e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.484212e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.484212e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.313604e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.570590e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.570590e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486735e-01 +- 3.293563e-05 )  GeV^0
-TOTAL       :     0.542592 sec
+TOTAL       :     0.557914 sec
 INFO: No Floating Point Exceptions have been reported
-     1,633,403,655      cycles                           #    2.984 GHz                    
-     3,962,789,991      instructions                     #    2.43  insn per cycle         
-       0.548005774 seconds time elapsed
+     1,698,515,028      cycles                           #    3.014 GHz                    
+     3,997,527,782      instructions                     #    2.35  insn per cycle         
+       0.564171143 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4: 1578) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest_cpp.exe
@@ -140,15 +140,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 3.139117e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.581885e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.581885e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.069297e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.474961e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.474961e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486735e-01 +- 3.293562e-05 )  GeV^0
-TOTAL       :     0.419606 sec
+TOTAL       :     0.435063 sec
 INFO: No Floating Point Exceptions have been reported
-     1,254,162,397      cycles                           #    2.954 GHz                    
-     2,493,681,375      instructions                     #    1.99  insn per cycle         
-       0.425051478 seconds time elapsed
+     1,286,599,575      cycles                           #    2.919 GHz                    
+     2,528,332,939      instructions                     #    1.97  insn per cycle         
+       0.441354656 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1910) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest_cpp.exe
@@ -169,15 +169,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 3.138494e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.665958e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.665958e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.180191e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.819453e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.819453e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486735e-01 +- 3.293562e-05 )  GeV^0
-TOTAL       :     0.421327 sec
+TOTAL       :     0.425326 sec
 INFO: No Floating Point Exceptions have been reported
-     1,237,307,699      cycles                           #    2.903 GHz                    
-     2,467,612,553      instructions                     #    1.99  insn per cycle         
-       0.426840225 seconds time elapsed
+     1,261,525,072      cycles                           #    2.926 GHz                    
+     2,504,983,030      instructions                     #    1.99  insn per cycle         
+       0.431704777 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1855) (512y:    1) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/runTest_cpp.exe
@@ -198,15 +198,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 3.027449e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.114286e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.114286e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.850782e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.787254e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.787254e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486735e-01 +- 3.293561e-05 )  GeV^0
-TOTAL       :     0.433178 sec
+TOTAL       :     0.464725 sec
 INFO: No Floating Point Exceptions have been reported
-     1,076,863,966      cycles                           #    2.460 GHz                    
-     2,071,125,855      instructions                     #    1.92  insn per cycle         
-       0.438611419 seconds time elapsed
+     1,108,955,129      cycles                           #    2.357 GHz                    
+     2,107,952,878      instructions                     #    1.90  insn per cycle         
+       0.471172185 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1039) (512y:    5) (512z: 1290)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/runTest_cpp.exe
diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt
index 4760459d1f..24f2cc254b 100644
--- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 
-DATE: 2024-07-23_17:02:04
+DATE: 2024-08-08_20:38:01
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 1.067489e+08                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.259235e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.713015e+09                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.481519e+08                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.098490e+09                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.734508e+09                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486732e-01 +- 3.293572e-05 )  GeV^0
-TOTAL       :     0.484584 sec
+TOTAL       :     0.480270 sec
 INFO: No Floating Point Exceptions have been reported
-     2,081,403,027      cycles                           #    2.924 GHz                    
-     2,972,594,980      instructions                     #    1.43  insn per cycle         
-       0.768624569 seconds time elapsed
+     2,041,258,883      cycles                           #    2.865 GHz                    
+     2,919,368,257      instructions                     #    1.43  insn per cycle         
+       0.770727877 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 93
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -82,15 +82,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME]     (23) = ( 9.423311e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.085961e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.085961e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.423477e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.084213e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.084213e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486735e-01 +- 3.293563e-05 )  GeV^0
-TOTAL       :     1.198062 sec
+TOTAL       :     1.208276 sec
 INFO: No Floating Point Exceptions have been reported
-     3,620,156,629      cycles                           #    3.009 GHz                    
-     9,465,393,926      instructions                     #    2.61  insn per cycle         
-       1.203587670 seconds time elapsed
+     3,647,443,455      cycles                           #    3.005 GHz                    
+     9,504,212,055      instructions                     #    2.61  insn per cycle         
+       1.214581993 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:  366) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest_cpp.exe
@@ -111,15 +111,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 2.238505e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.360648e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.360648e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.204450e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.296384e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.296384e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486735e-01 +- 3.293563e-05 )  GeV^0
-TOTAL       :     0.554290 sec
+TOTAL       :     0.572123 sec
 INFO: No Floating Point Exceptions have been reported
-     1,633,916,845      cycles                           #    2.922 GHz                    
-     3,928,828,269      instructions                     #    2.40  insn per cycle         
-       0.559791116 seconds time elapsed
+     1,666,311,430      cycles                           #    2.883 GHz                    
+     3,968,199,942      instructions                     #    2.38  insn per cycle         
+       0.578517715 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4: 1516) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest_cpp.exe
@@ -140,15 +140,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 3.121156e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.545020e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.545020e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.086457e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.476966e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.476966e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486735e-01 +- 3.293562e-05 )  GeV^0
-TOTAL       :     0.421088 sec
+TOTAL       :     0.433372 sec
 INFO: No Floating Point Exceptions have been reported
-     1,254,249,816      cycles                           #    2.942 GHz                    
-     2,477,993,318      instructions                     #    1.98  insn per cycle         
-       0.426883352 seconds time elapsed
+     1,287,648,503      cycles                           #    2.933 GHz                    
+     2,519,527,968      instructions                     #    1.96  insn per cycle         
+       0.439715000 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1801) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest_cpp.exe
@@ -169,15 +169,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 3.213732e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.821522e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.821522e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.137610e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.760529e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.760529e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486735e-01 +- 3.293562e-05 )  GeV^0
-TOTAL       :     0.410608 sec
+TOTAL       :     0.429722 sec
 INFO: No Floating Point Exceptions have been reported
-     1,222,985,428      cycles                           #    2.944 GHz                    
-     2,454,854,040      instructions                     #    2.01  insn per cycle         
-       0.416044203 seconds time elapsed
+     1,269,495,412      cycles                           #    2.915 GHz                    
+     2,496,260,070      instructions                     #    1.97  insn per cycle         
+       0.436264737 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1764) (512y:    1) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/runTest_cpp.exe
@@ -198,15 +198,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 3.076410e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.296730e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.296730e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.044380e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.291761e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.291761e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486735e-01 +- 3.293561e-05 )  GeV^0
-TOTAL       :     0.427614 sec
+TOTAL       :     0.438334 sec
 INFO: No Floating Point Exceptions have been reported
-     1,070,051,378      cycles                           #    2.475 GHz                    
-     2,054,834,381      instructions                     #    1.92  insn per cycle         
-       0.433034280 seconds time elapsed
+     1,106,020,121      cycles                           #    2.491 GHz                    
+     2,096,224,924      instructions                     #    1.90  insn per cycle         
+       0.444840756 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:  934) (512y:    5) (512z: 1271)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/runTest_cpp.exe
diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt
index bd3bc8d8f9..097ec6962d 100644
--- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 
-DATE: 2024-07-23_17:02:15
+DATE: 2024-08-08_20:38:12
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 5.430459e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.319984e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.394007e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.657009e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.040901e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.368076e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.521551 sec
+TOTAL       :     0.510823 sec
 INFO: No Floating Point Exceptions have been reported
-     2,263,626,755      cycles                           #    2.970 GHz                    
-     3,140,513,974      instructions                     #    1.39  insn per cycle         
-       0.819288000 seconds time elapsed
+     2,202,406,007      cycles                           #    2.933 GHz                    
+     3,131,483,968      instructions                     #    1.42  insn per cycle         
+       0.809574698 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 132
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -82,15 +82,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME]     (23) = ( 9.284880e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.058338e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.058338e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.987871e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.027797e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.027797e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     1.239757 sec
+TOTAL       :     1.312691 sec
 INFO: No Floating Point Exceptions have been reported
-     3,778,922,988      cycles                           #    3.036 GHz                    
-     9,745,451,778      instructions                     #    2.58  insn per cycle         
-       1.245537909 seconds time elapsed
+     3,886,479,162      cycles                           #    2.942 GHz                    
+     9,876,785,784      instructions                     #    2.54  insn per cycle         
+       1.321966236 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:  338) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest_cpp.exe
@@ -111,15 +111,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 1.609344e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.086524e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.086524e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.603482e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.083956e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.083956e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.762267 sec
+TOTAL       :     0.795166 sec
 INFO: No Floating Point Exceptions have been reported
-     2,282,992,204      cycles                           #    2.975 GHz                    
-     5,912,624,923      instructions                     #    2.59  insn per cycle         
-       0.767997063 seconds time elapsed
+     2,395,751,097      cycles                           #    2.981 GHz                    
+     6,041,369,753      instructions                     #    2.52  insn per cycle         
+       0.804292816 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4: 1409) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest_cpp.exe
@@ -140,15 +140,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 2.266801e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.350656e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.350656e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.333538e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.457835e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.457835e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.572838 sec
+TOTAL       :     0.593950 sec
 INFO: No Floating Point Exceptions have been reported
-     1,637,126,786      cycles                           #    2.833 GHz                    
-     3,250,368,511      instructions                     #    1.99  insn per cycle         
-       0.578515372 seconds time elapsed
+     1,751,397,279      cycles                           #    2.907 GHz                    
+     3,381,419,349      instructions                     #    1.93  insn per cycle         
+       0.603155882 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1555) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest_cpp.exe
@@ -169,15 +169,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 2.415598e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.643733e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.643733e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.383716e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.579987e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.579987e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.544901 sec
+TOTAL       :     0.584649 sec
 INFO: No Floating Point Exceptions have been reported
-     1,603,336,377      cycles                           #    2.915 GHz                    
-     3,205,971,638      instructions                     #    2.00  insn per cycle         
-       0.550693078 seconds time elapsed
+     1,722,820,866      cycles                           #    2.904 GHz                    
+     3,335,061,421      instructions                     #    1.94  insn per cycle         
+       0.593900292 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1434) (512y:  101) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/runTest_cpp.exe
@@ -198,15 +198,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 2.250649e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.279035e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.279035e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.223321e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.217067e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.217067e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.576205 sec
+TOTAL       :     0.618111 sec
 INFO: No Floating Point Exceptions have been reported
-     1,354,591,728      cycles                           #    2.330 GHz                    
-     2,373,898,744      instructions                     #    1.75  insn per cycle         
-       0.581963295 seconds time elapsed
+     1,474,024,650      cycles                           #    2.351 GHz                    
+     2,505,057,782      instructions                     #    1.70  insn per cycle         
+       0.627415589 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:  744) (512y:   64) (512z: 1062)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/runTest_cpp.exe
diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt
index b203416aeb..909ea75534 100644
--- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 
-DATE: 2024-07-23_17:02:27
+DATE: 2024-08-08_20:38:24
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 5.443321e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.735482e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.752926e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 7.791313e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.626392e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.791667e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.524557 sec
+TOTAL       :     0.506993 sec
 INFO: No Floating Point Exceptions have been reported
-     2,266,586,877      cycles                           #    2.953 GHz                    
-     3,158,332,812      instructions                     #    1.39  insn per cycle         
-       0.824937952 seconds time elapsed
+     2,160,282,873      cycles                           #    2.928 GHz                    
+     3,104,863,193      instructions                     #    1.44  insn per cycle         
+       0.795042821 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 124
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
@@ -82,15 +82,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME]     (23) = ( 9.343960e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.065936e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.065936e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.274915e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.058342e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.058342e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     1.230924 sec
+TOTAL       :     1.272460 sec
 INFO: No Floating Point Exceptions have been reported
-     3,759,605,042      cycles                           #    3.042 GHz                    
-     9,636,433,569      instructions                     #    2.56  insn per cycle         
-       1.236672615 seconds time elapsed
+     3,870,727,422      cycles                           #    3.021 GHz                    
+     9,766,927,758      instructions                     #    2.52  insn per cycle         
+       1.281884523 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:  356) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest_cpp.exe
@@ -111,15 +111,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 1.591612e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.060590e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.060590e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.623095e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.126207e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.126207e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.770075 sec
+TOTAL       :     0.787281 sec
 INFO: No Floating Point Exceptions have been reported
-     2,304,102,028      cycles                           #    2.973 GHz                    
-     5,854,779,970      instructions                     #    2.54  insn per cycle         
-       0.775710054 seconds time elapsed
+     2,408,985,457      cycles                           #    3.026 GHz                    
+     5,983,716,153      instructions                     #    2.48  insn per cycle         
+       0.796654714 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4: 1367) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest_cpp.exe
@@ -140,15 +140,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 2.314183e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.423375e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.423375e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.282374e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.352435e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.352435e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.562294 sec
+TOTAL       :     0.601451 sec
 INFO: No Floating Point Exceptions have been reported
-     1,652,087,916      cycles                           #    2.912 GHz                    
-     3,213,928,099      instructions                     #    1.95  insn per cycle         
-       0.567903877 seconds time elapsed
+     1,779,110,472      cycles                           #    2.917 GHz                    
+     3,343,155,447      instructions                     #    1.88  insn per cycle         
+       0.610581817 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1471) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest_cpp.exe
@@ -169,15 +169,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 2.431496e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.634136e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.634136e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.404645e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.636849e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.636849e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.539794 sec
+TOTAL       :     0.577304 sec
 INFO: No Floating Point Exceptions have been reported
-     1,597,382,114      cycles                           #    2.933 GHz                    
-     3,178,138,631      instructions                     #    1.99  insn per cycle         
-       0.545290348 seconds time elapsed
+     1,713,534,680      cycles                           #    2.924 GHz                    
+     3,304,839,422      instructions                     #    1.93  insn per cycle         
+       0.586559957 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1370) (512y:  101) (512z:    0)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/runTest_cpp.exe
@@ -198,15 +198,15 @@ Process                     = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inline
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME]     (23) = ( 2.311837e+06                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.372038e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.372038e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.274336e+06                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.329961e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.329961e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 1.486736e-01 +- 3.293564e-05 )  GeV^0
-TOTAL       :     0.562137 sec
+TOTAL       :     0.603476 sec
 INFO: No Floating Point Exceptions have been reported
-     1,361,155,598      cycles                           #    2.400 GHz                    
-     2,358,011,111      instructions                     #    1.73  insn per cycle         
-       0.567788379 seconds time elapsed
+     1,481,795,981      cycles                           #    2.421 GHz                    
+     2,484,912,045      instructions                     #    1.68  insn per cycle         
+       0.612779368 seconds time elapsed
 =Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2:  692) (512y:   64) (512z: 1053)
 -------------------------------------------------------------------------
 runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/runTest_cpp.exe
diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt
index 4b28d920f4..23a45578df 100644
--- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_21:07:37
+DATE: 2024-08-08_20:35:05
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.987516e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.185615e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.286924e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.006324e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.190183e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.288100e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     0.528112 sec
+TOTAL       :     0.519336 sec
 INFO: No Floating Point Exceptions have been reported
-     2,142,646,791      cycles                           #    2.810 GHz                    
-     3,068,281,279      instructions                     #    1.43  insn per cycle         
-       0.820901730 seconds time elapsed
+     2,213,490,510      cycles                           #    2.944 GHz                    
+     3,142,609,105      instructions                     #    1.42  insn per cycle         
+       0.808787239 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.894025e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.942883e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.942883e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.848625e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.896982e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.896982e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     5.643752 sec
+TOTAL       :     5.805390 sec
 INFO: No Floating Point Exceptions have been reported
-    17,226,404,935      cycles                           #    3.050 GHz                    
-    45,931,312,380      instructions                     #    2.67  insn per cycle         
-       5.649207552 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  636) (avx2:    0) (512y:    0) (512z:    0)
+    17,322,328,356      cycles                           #    2.980 GHz                    
+    46,027,314,744      instructions                     #    2.66  insn per cycle         
+       5.814672958 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  623) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.306536e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.469427e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.469427e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.232999e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.394305e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.394305e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     3.278748 sec
+TOTAL       :     3.377455 sec
 INFO: No Floating Point Exceptions have been reported
-     9,984,585,810      cycles                           #    3.041 GHz                    
-    27,808,479,846      instructions                     #    2.79  insn per cycle         
-       3.284501363 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2548) (avx2:    0) (512y:    0) (512z:    0)
+    10,089,219,468      cycles                           #    2.980 GHz                    
+    27,901,985,402      instructions                     #    2.77  insn per cycle         
+       3.386689562 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.219913e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.629742e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.629742e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.131636e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.534601e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.534601e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     2.116298 sec
+TOTAL       :     2.174966 sec
 INFO: No Floating Point Exceptions have been reported
-     6,086,077,866      cycles                           #    2.869 GHz                    
-    12,591,046,550      instructions                     #    2.07  insn per cycle         
-       2.121982392 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2696) (512y:    0) (512z:    0)
+     6,180,272,446      cycles                           #    2.831 GHz                    
+    12,679,670,239      instructions                     #    2.05  insn per cycle         
+       2.183950081 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2613) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.636641e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.104777e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.104777e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.604193e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.099182e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.099182e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     1.966039 sec
+TOTAL       :     2.003125 sec
 INFO: No Floating Point Exceptions have been reported
-     5,583,643,425      cycles                           #    2.833 GHz                    
-    12,005,695,706      instructions                     #    2.15  insn per cycle         
-       1.971558570 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2444) (512y:  144) (512z:    0)
+     5,696,944,820      cycles                           #    2.832 GHz                    
+    12,097,133,291      instructions                     #    2.12  insn per cycle         
+       2.012150160 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2356) (512y:  144) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.702611e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.899100e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.899100e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.648289e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.842846e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.842846e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     2.939991 sec
+TOTAL       :     3.006654 sec
 INFO: No Floating Point Exceptions have been reported
-     5,757,651,640      cycles                           #    1.955 GHz                    
-     8,345,980,239      instructions                     #    1.45  insn per cycle         
-       2.945651612 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1486) (512y:  122) (512z: 1805)
+     5,848,300,882      cycles                           #    1.940 GHz                    
+     8,438,808,313      instructions                     #    1.44  insn per cycle         
+       3.015775673 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1456) (512y:  122) (512z: 1805)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt
index 474b872b4a..084acffe25 100644
--- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_21:08:01
+DATE: 2024-08-08_20:35:30
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.968225e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.178768e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.279997e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.973192e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.180411e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.278662e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     0.518790 sec
+TOTAL       :     0.518873 sec
 INFO: No Floating Point Exceptions have been reported
-     2,218,888,564      cycles                           #    2.961 GHz                    
-     3,191,991,067      instructions                     #    1.44  insn per cycle         
-       0.806488977 seconds time elapsed
+     2,217,952,324      cycles                           #    2.952 GHz                    
+     3,211,075,681      instructions                     #    1.45  insn per cycle         
+       0.807521486 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.948594e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.001763e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.001763e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.919771e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.971109e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.971109e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     5.490922 sec
+TOTAL       :     5.589458 sec
 INFO: No Floating Point Exceptions have been reported
-    16,707,703,822      cycles                           #    3.040 GHz                    
-    44,917,267,122      instructions                     #    2.69  insn per cycle         
-       5.496949602 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  580) (avx2:    0) (512y:    0) (512z:    0)
+    16,851,504,003      cycles                           #    3.011 GHz                    
+    45,007,980,146      instructions                     #    2.67  insn per cycle         
+       5.597787166 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  567) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.418137e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.601353e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.601353e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.433331e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.615119e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.615119e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     3.176327 sec
+TOTAL       :     3.183428 sec
 INFO: No Floating Point Exceptions have been reported
-     9,532,841,539      cycles                           #    2.997 GHz                    
-    26,690,753,956      instructions                     #    2.80  insn per cycle         
-       3.181964222 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2342) (avx2:    0) (512y:    0) (512z:    0)
+     9,605,830,601      cycles                           #    3.010 GHz                    
+    26,781,992,422      instructions                     #    2.79  insn per cycle         
+       3.191879831 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2330) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.794970e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.133255e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.133255e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.719654e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.056760e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.056760e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     2.295112 sec
+TOTAL       :     2.350234 sec
 INFO: No Floating Point Exceptions have been reported
-     6,599,473,152      cycles                           #    2.870 GHz                    
-    14,114,597,861      instructions                     #    2.14  insn per cycle         
-       2.300710249 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2780) (512y:    0) (512z:    0)
+     6,680,473,802      cycles                           #    2.833 GHz                    
+    14,206,471,082      instructions                     #    2.13  insn per cycle         
+       2.358807267 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2697) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.970311e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.333858e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.333858e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.858381e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.210770e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.210770e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     2.216523 sec
+TOTAL       :     2.286934 sec
 INFO: No Floating Point Exceptions have been reported
-     6,338,453,582      cycles                           #    2.854 GHz                    
-    13,709,424,623      instructions                     #    2.16  insn per cycle         
-       2.222146521 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2436) (512y:  297) (512z:    0)
+     6,467,572,645      cycles                           #    2.819 GHz                    
+    13,805,117,271      instructions                     #    2.13  insn per cycle         
+       2.295500484 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2348) (512y:  297) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.500387e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.675839e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.675839e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.556078e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.738376e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.738376e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     3.103428 sec
+TOTAL       :     3.078127 sec
 INFO: No Floating Point Exceptions have been reported
-     5,945,105,969      cycles                           #    1.913 GHz                    
-    10,105,639,078      instructions                     #    1.70  insn per cycle         
-       3.108831696 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1336) (512y:  208) (512z: 1985)
+     6,022,357,803      cycles                           #    1.952 GHz                    
+    10,198,455,945      instructions                     #    1.69  insn per cycle         
+       3.086650563 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1306) (512y:  208) (512z: 1985)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt
index a66d6683a6..3eab9e9753 100644
--- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_21:08:25
+DATE: 2024-08-08_20:35:54
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 9.559482e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.212410e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.403401e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.671843e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.219611e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.398007e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072877e+00 +- 3.361153e-03 )  GeV^0
-TOTAL       :     0.478047 sec
+TOTAL       :     0.483015 sec
 INFO: No Floating Point Exceptions have been reported
-     2,106,213,142      cycles                           #    2.975 GHz                    
-     3,003,323,992      instructions                     #    1.43  insn per cycle         
-       0.765175289 seconds time elapsed
+     2,057,665,691      cycles                           #    2.919 GHz                    
+     2,974,139,215      instructions                     #    1.45  insn per cycle         
+       0.763755746 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 149
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.002790e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.059233e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.059233e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.976573e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.032296e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.032296e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072937e+00 +- 3.361545e-03 )  GeV^0
-TOTAL       :     5.322844 sec
+TOTAL       :     5.392550 sec
 INFO: No Floating Point Exceptions have been reported
-    16,223,048,121      cycles                           #    3.046 GHz                    
-    45,327,540,091      instructions                     #    2.79  insn per cycle         
-       5.327983233 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  614) (avx2:    0) (512y:    0) (512z:    0)
+    16,223,721,004      cycles                           #    3.006 GHz                    
+    45,343,520,122      instructions                     #    2.79  insn per cycle         
+       5.398630583 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  601) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.698933e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.051306e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.051306e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.606915e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.959618e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.959618e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072937e+00 +- 3.361544e-03 )  GeV^0
-TOTAL       :     2.317323 sec
+TOTAL       :     2.365944 sec
 INFO: No Floating Point Exceptions have been reported
-     7,057,769,754      cycles                           #    3.040 GHz                    
-    17,776,370,522      instructions                     #    2.52  insn per cycle         
-       2.322507956 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3154) (avx2:    0) (512y:    0) (512z:    0)
+     7,142,483,054      cycles                           #    3.012 GHz                    
+    17,793,150,450      instructions                     #    2.49  insn per cycle         
+       2.371767516 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3136) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 8.614657e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.810162e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.810162e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 8.534145e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.726326e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.726326e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072967e+00 +- 3.361967e-03 )  GeV^0
-TOTAL       :     1.301003 sec
+TOTAL       :     1.317221 sec
 INFO: No Floating Point Exceptions have been reported
-     3,749,560,002      cycles                           #    2.872 GHz                    
-     8,268,310,663      instructions                     #    2.21  insn per cycle         
-       1.306245545 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3379) (512y:    0) (512z:    0)
+     3,766,549,622      cycles                           #    2.849 GHz                    
+     8,281,231,591      instructions                     #    2.20  insn per cycle         
+       1.323030863 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3355) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 9.173728e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.054118e+06                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.054118e+06                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 9.037857e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.038500e+06                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.038500e+06                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072967e+00 +- 3.361967e-03 )  GeV^0
-TOTAL       :     1.229272 sec
+TOTAL       :     1.247672 sec
 INFO: No Floating Point Exceptions have been reported
-     3,562,944,302      cycles                           #    2.888 GHz                    
-     7,923,374,917      instructions                     #    2.22  insn per cycle         
-       1.234483154 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3231) (512y:   20) (512z:    0)
+     3,572,380,687      cycles                           #    2.852 GHz                    
+     7,938,220,748      instructions                     #    2.22  insn per cycle         
+       1.253461191 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3201) (512y:   20) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 6.833328e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.525856e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.525856e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.780907e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.464899e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.464899e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072967e+00 +- 3.361967e-03 )  GeV^0
-TOTAL       :     1.620133 sec
+TOTAL       :     1.635161 sec
 INFO: No Floating Point Exceptions have been reported
-     3,261,172,355      cycles                           #    2.008 GHz                    
-     6,104,371,418      instructions                     #    1.87  insn per cycle         
-       1.625447082 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2407) (512y:   24) (512z: 2153)
+     3,277,760,479      cycles                           #    1.999 GHz                    
+     6,118,650,971      instructions                     #    1.87  insn per cycle         
+       1.640889669 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2294) (512y:   24) (512z: 2154)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt
index 01cbb3ba00..95f2f81a67 100644
--- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_21:08:45
+DATE: 2024-08-08_20:36:15
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 9.977153e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.485991e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.727696e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.014048e+08                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.487826e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.715050e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072877e+00 +- 3.361153e-03 )  GeV^0
-TOTAL       :     0.479625 sec
+TOTAL       :     0.479773 sec
 INFO: No Floating Point Exceptions have been reported
-     2,087,735,706      cycles                           #    2.950 GHz                    
-     2,974,984,458      instructions                     #    1.42  insn per cycle         
-       0.766158623 seconds time elapsed
+     2,021,404,320      cycles                           #    2.871 GHz                    
+     2,909,718,804      instructions                     #    1.44  insn per cycle         
+       0.763747586 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 2.032109e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.091836e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.091836e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 2.015289e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.073220e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.073220e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072937e+00 +- 3.361545e-03 )  GeV^0
-TOTAL       :     5.260965 sec
+TOTAL       :     5.290195 sec
 INFO: No Floating Point Exceptions have been reported
-    15,970,657,452      cycles                           #    3.043 GHz                    
-    44,436,012,793      instructions                     #    2.78  insn per cycle         
-       5.266053329 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  547) (avx2:    0) (512y:    0) (512z:    0)
+    15,992,452,194      cycles                           #    3.020 GHz                    
+    44,447,001,670      instructions                     #    2.78  insn per cycle         
+       5.296101650 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  534) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.469324e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.951349e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.951349e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.486417e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.979858e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.979858e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072937e+00 +- 3.361544e-03 )  GeV^0
-TOTAL       :     2.003414 sec
+TOTAL       :     2.001515 sec
 INFO: No Floating Point Exceptions have been reported
-     6,069,324,654      cycles                           #    3.023 GHz                    
-    17,077,758,378      instructions                     #    2.81  insn per cycle         
-       2.008459288 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2881) (avx2:    0) (512y:    0) (512z:    0)
+     6,083,399,365      cycles                           #    3.032 GHz                    
+    17,096,762,778      instructions                     #    2.81  insn per cycle         
+       2.007478242 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2863) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 6.186571e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.792405e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.792405e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.273384e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.901765e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.901765e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072967e+00 +- 3.361967e-03 )  GeV^0
-TOTAL       :     1.779064 sec
+TOTAL       :     1.760820 sec
 INFO: No Floating Point Exceptions have been reported
-     5,029,323,256      cycles                           #    2.820 GHz                    
-    10,228,128,275      instructions                     #    2.03  insn per cycle         
-       1.784210959 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3916) (512y:    0) (512z:    0)
+     5,038,046,690      cycles                           #    2.853 GHz                    
+    10,244,068,560      instructions                     #    2.03  insn per cycle         
+       1.766743334 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3892) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 6.269492e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.881858e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.881858e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 6.352422e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.995021e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.995021e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072967e+00 +- 3.361967e-03 )  GeV^0
-TOTAL       :     1.756710 sec
+TOTAL       :     1.739024 sec
 INFO: No Floating Point Exceptions have been reported
-     4,975,750,592      cycles                           #    2.825 GHz                    
-     9,998,359,521      instructions                     #    2.01  insn per cycle         
-       1.761821685 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3823) (512y:    2) (512z:    0)
+     4,995,379,501      cycles                           #    2.864 GHz                    
+    10,014,742,907      instructions                     #    2.00  insn per cycle         
+       1.744931983 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 3793) (512y:    2) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.927584e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.287376e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.287376e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.909740e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.260066e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.260066e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072967e+00 +- 3.361967e-03 )  GeV^0
-TOTAL       :     2.212094 sec
+TOTAL       :     2.224170 sec
 INFO: No Floating Point Exceptions have been reported
-     4,375,301,070      cycles                           #    1.974 GHz                    
-     8,447,130,711      instructions                     #    1.93  insn per cycle         
-       2.217463579 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2895) (512y:    4) (512z: 2751)
+     4,384,022,767      cycles                           #    1.967 GHz                    
+     8,465,829,971      instructions                     #    1.93  insn per cycle         
+       2.230123024 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2782) (512y:    4) (512z: 2752)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt
index 52252cbfc7..3f2b21ab02 100644
--- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_21:09:06
+DATE: 2024-08-08_20:36:36
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.991553e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.180542e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.281710e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.111342e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.183781e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.280569e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     0.521706 sec
+TOTAL       :     0.516736 sec
 INFO: No Floating Point Exceptions have been reported
-     2,224,165,682      cycles                           #    2.953 GHz                    
-     3,203,188,534      instructions                     #    1.44  insn per cycle         
-       0.810337574 seconds time elapsed
+     2,204,839,521      cycles                           #    2.950 GHz                    
+     3,193,475,947      instructions                     #    1.45  insn per cycle         
+       0.804039579 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.871017e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.918846e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.918846e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.851387e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.898716e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.898716e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     5.711061 sec
+TOTAL       :     5.792449 sec
 INFO: No Floating Point Exceptions have been reported
-    17,404,987,337      cycles                           #    3.045 GHz                    
-    46,083,110,316      instructions                     #    2.65  insn per cycle         
-       5.716653594 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  636) (avx2:    0) (512y:    0) (512z:    0)
+    17,478,048,232      cycles                           #    3.014 GHz                    
+    46,175,878,133      instructions                     #    2.64  insn per cycle         
+       5.800949907 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  623) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.312498e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.481884e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.481884e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.302826e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.471365e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.471365e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     3.272999 sec
+TOTAL       :     3.305610 sec
 INFO: No Floating Point Exceptions have been reported
-     9,986,410,862      cycles                           #    3.047 GHz                    
-    27,601,244,510      instructions                     #    2.76  insn per cycle         
-       3.278531877 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2593) (avx2:    0) (512y:    0) (512z:    0)
+    10,029,884,170      cycles                           #    3.027 GHz                    
+    27,698,012,954      instructions                     #    2.76  insn per cycle         
+       3.314264877 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2581) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.173705e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.573635e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.573635e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.212203e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.631040e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.631040e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     2.134679 sec
+TOTAL       :     2.141280 sec
 INFO: No Floating Point Exceptions have been reported
-     6,031,616,440      cycles                           #    2.820 GHz                    
-    12,494,854,133      instructions                     #    2.07  insn per cycle         
-       2.140101571 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2783) (512y:    0) (512z:    0)
+     6,126,755,092      cycles                           #    2.851 GHz                    
+    12,585,784,837      instructions                     #    2.05  insn per cycle         
+       2.149799113 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2765) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 5.444634e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.896261e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.896261e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.714807e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.220314e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.220314e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     2.032418 sec
+TOTAL       :     1.966130 sec
 INFO: No Floating Point Exceptions have been reported
-     5,512,811,090      cycles                           #    2.706 GHz                    
-    11,931,754,486      instructions                     #    2.16  insn per cycle         
-       2.038065532 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2534) (512y:  146) (512z:    0)
+     5,614,473,659      cycles                           #    2.844 GHz                    
+    12,019,662,665      instructions                     #    2.14  insn per cycle         
+       1.974902809 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2510) (512y:  146) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.605249e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.805954e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.805954e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.735274e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.937488e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.937488e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     3.017835 sec
+TOTAL       :     2.937106 sec
 INFO: No Floating Point Exceptions have been reported
-     5,646,843,336      cycles                           #    1.868 GHz                    
-     8,120,170,284      instructions                     #    1.44  insn per cycle         
-       3.023662601 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1668) (512y:  126) (512z: 1865)
+     5,684,383,017      cycles                           #    1.930 GHz                    
+     8,211,471,869      instructions                     #    1.44  insn per cycle         
+       2.945845267 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1646) (512y:  126) (512z: 1865)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt
index d52539d8e7..9ec77e6c2c 100644
--- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt
@@ -1,7 +1,7 @@
 
 Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
-OMPFLAGS=-fopenmp 
+OMPFLAGS=
 FPTYPE='d'
 HELINL='0'
 HRDCOD='0'
@@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-06-28_21:09:31
+DATE: 2024-08-08_20:37:00
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -49,21 +49,23 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts:
 Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary            = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME]     (23) = ( 4.896359e+07                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.171747e+08                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.273533e+08                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 5.087294e+07                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.176774e+08                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.273815e+08                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     0.523837 sec
+TOTAL       :     0.521745 sec
 INFO: No Floating Point Exceptions have been reported
-     2,210,261,649      cycles                           #    2.923 GHz                    
-     3,098,934,216      instructions                     #    1.40  insn per cycle         
-       0.814091711 seconds time elapsed
+     2,190,333,356      cycles                           #    2.907 GHz                    
+     3,117,272,451      instructions                     #    1.42  insn per cycle         
+       0.811246203 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208
 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2
@@ -80,20 +82,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 1.909485e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.960095e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.960095e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 1.899666e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.949679e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.949679e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     5.598303 sec
+TOTAL       :     5.649808 sec
 INFO: No Floating Point Exceptions have been reported
-    16,960,347,258      cycles                           #    3.027 GHz                    
-    45,101,575,966      instructions                     #    2.66  insn per cycle         
-       5.604097252 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:  581) (avx2:    0) (512y:    0) (512z:    0)
+    17,042,397,704      cycles                           #    3.012 GHz                    
+    45,200,059,180      instructions                     #    2.65  insn per cycle         
+       5.658309716 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:  568) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -108,20 +111,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.386759e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.560207e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.560207e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.442760e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.623868e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.623868e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     3.207946 sec
+TOTAL       :     3.175173 sec
 INFO: No Floating Point Exceptions have been reported
-     9,517,840,011      cycles                           #    2.994 GHz                    
-    26,245,772,623      instructions                     #    2.76  insn per cycle         
-       3.215418509 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2397) (avx2:    0) (512y:    0) (512z:    0)
+     9,616,707,948      cycles                           #    3.021 GHz                    
+    26,345,303,385      instructions                     #    2.74  insn per cycle         
+       3.183844820 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2385) (avx2:    0) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -136,20 +140,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.136448e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.430910e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.430910e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.409096e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.707370e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.707370e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     2.672758 sec
+TOTAL       :     2.509673 sec
 INFO: No Floating Point Exceptions have been reported
-     6,760,215,182      cycles                           #    2.667 GHz                    
-    14,040,901,808      instructions                     #    2.08  insn per cycle         
-       2.679767706 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2901) (512y:    0) (512z:    0)
+     6,823,505,729      cycles                           #    2.711 GHz                    
+    14,133,345,545      instructions                     #    2.07  insn per cycle         
+       2.518344311 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2883) (512y:    0) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -164,20 +169,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 4.448322e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.786078e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.786078e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 4.915857e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.278986e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.278986e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     2.482950 sec
+TOTAL       :     2.261621 sec
 INFO: No Floating Point Exceptions have been reported
-     6,405,645,930      cycles                           #    2.679 GHz                    
-    13,525,136,384      instructions                     #    2.11  insn per cycle         
-       2.488598452 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2543) (512y:  302) (512z:    0)
+     6,478,665,786      cycles                           #    2.855 GHz                    
+    13,612,638,339      instructions                     #    2.10  insn per cycle         
+       2.270008014 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 2519) (512y:  302) (512z:    0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
@@ -192,20 +198,21 @@ Process                     = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHe
 Workflow summary            = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision                = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME]     (23) = ( 3.416786e+05                 )  sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.592951e+05                 )  sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.592951e+05                 )  sec^-1
+EvtsPerSec[Rmb+ME]     (23) = ( 3.779798e+05                 )  sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.989152e+05                 )  sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.989152e+05                 )  sec^-1
 MeanMatrixElemValue         = ( 2.072848e+00 +- 3.360985e-03 )  GeV^0
-TOTAL       :     3.180333 sec
+TOTAL       :     2.903794 sec
 INFO: No Floating Point Exceptions have been reported
-     5,593,468,380      cycles                           #    1.770 GHz                    
-     9,219,345,276      instructions                     #    1.65  insn per cycle         
-       3.186249051 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1453) (512y:  212) (512z: 2058)
+     5,684,727,855      cycles                           #    1.953 GHz                    
+     9,307,942,112      instructions                     #    1.64  insn per cycle         
+       2.912446958 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:    0) (avx2: 1431) (512y:  212) (512z: 2058)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe
+runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe
+INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
 [  PASSED  ] 3 tests.
+INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
diff --git a/tools/mg-clang-format/mg-clang-format b/tools/mg-clang-format/mg-clang-format
index 37ae30d79b..9f36462da3 100755
--- a/tools/mg-clang-format/mg-clang-format
+++ b/tools/mg-clang-format/mg-clang-format
@@ -15,22 +15,24 @@ else
 fi  
 ###echo clangVersion=$clangVersion
 
-if [ ${clangVersion} -ge 13 ]; then
+if [ ${clangVersion} -ge 13 ] && [ ${clangVersion} -le 15 ]; then
   exec clang-format "$@"
 else
   if [ ! -d /cvmfs/sft.cern.ch/lcg/releases/clang ]; then
-    echo "ERROR! clang-format version >= 13 is not installed and /cvmfs/sft.cern.ch/lcg/releases/clang is not reachable"
+    echo "ERROR! clang-format version >= 13 and <= 15 is not installed and /cvmfs/sft.cern.ch/lcg/releases/clang is not reachable"
     exit 1    
   fi
   redrel=$(cat /etc/redhat-release 2> /dev/null)
   if [ "${redrel##*release 7}" != "${redrel}" ]; then
-    clangDir=/cvmfs/sft.cern.ch/lcg/releases/clang/13.0.1-721c8/x86_64-centos7
+    ###clangDir=/cvmfs/sft.cern.ch/lcg/releases/clang/13.0.1-721c8/x86_64-centos7
+    clangDir=/cvmfs/sft.cern.ch/lcg/releases/clang/15.0.7-27d6b/x86_64-centos7
   elif [ "${redrel##*release 8}" != "${redrel}" ]; then
     clangDir=/cvmfs/sft.cern.ch/lcg/releases/clang/13.0.1-721c8/x86_64-centos8
   elif [ "${redrel##*release 9}" != "${redrel}" ]; then
-    clangDir=/cvmfs/sft.cern.ch/lcg/releases/clang/14.0.6-14bdb/x86_64-centos9
+    ###clangDir=/cvmfs/sft.cern.ch/lcg/releases/clang/14.0.6-14bdb/x86_64-centos9
+    clangDir=/cvmfs/sft.cern.ch/lcg/releases/clang/15.0.7-27d6b/x86_64-el9
   else
-    echo "ERROR! clang-format version >= 13 is not installed and RedHat release could not be identified (${redrel})"
+    echo "ERROR! clang-format version >= 13 and <= 15 is not installed and RedHat release could not be identified (${redrel})"
     exit 1
   fi
   source ${clangDir}/setup.sh