Merge pull request #1012 from valassi/amd

workaround for FPE in vxxxxx on HIP (and fixes for v1.00.01 tags)
madgraph5 · Oct 4, 2024 · 5d43720 · 5d43720
2 parents 7e8e033 + 0524cd1
commit 5d43720
Show file tree

Hide file tree

Showing 52 changed files with 310 additions and 211 deletions.
diff --git a/.github/workflows/archiver.sh b/.github/workflows/archiver.sh
@@ -26,8 +26,17 @@ mkdir ${outdir}
 outfile=${outdir}/VERSION.txt
 touch ${outfile}
 dateformat='%Y-%m-%d_%H:%M:%S UTC'
+cudacpp_major=$(cat __init__.py | grep __version__ | sed -r 's/(.*=|\(|\)|,)/ /g' | awk '{print $1}')
+cudacpp_minor=$(cat __init__.py | grep __version__ | sed -r 's/(.*=|\(|\)|,)/ /g' | awk '{print $2}')
+cudacpp_patch=$(cat __init__.py | grep __version__ | sed -r 's/(.*=|\(|\)|,)/ /g' | awk '{print $3}')
+###echo "(From CUDACPP_OUTPUT/__init__.py)"
+###echo "cudacpp (major, minor, patch) = ( ${cudacpp_major}, ${cudacpp_minor}, ${cudacpp_patch} )"
+if [ ${cudacpp_major} -lt 0 ] || [ ${cudacpp_major} -gt 99 ]; then echo "ERROR! cudacpp_major is not in the [0,99] range"; exit 1; fi
+if [ ${cudacpp_minor} -lt 0 ] || [ ${cudacpp_minor} -gt 99 ]; then echo "ERROR! cudacpp_minor is not in the [0,99] range"; exit 1; fi
+if [ ${cudacpp_patch} -lt 0 ] || [ ${cudacpp_patch} -gt 99 ]; then echo "ERROR! cudacpp_patch is not in the [0,99] range"; exit 1; fi
+cudacpp_version=$(printf "%1d.%02d.%02d" ${cudacpp_major} ${cudacpp_minor} ${cudacpp_patch})
 echo "(From CUDACPP_OUTPUT/__init__.py)" >> ${outfile}
-echo "cudacpp_version              = $(cat __init__.py | awk '/__version__/{print $3}' | sed 's/(//' | sed 's/)//' | sed 's/,/./g')" >> ${outfile}
+echo "cudacpp_version              = ${cudacpp_version}" >> ${outfile}
 echo "mg5_version_minimal          = $(cat __init__.py | awk '/minimal_mg5amcnlo_version/{print $3}'  | sed 's/(//' | sed 's/)//' | sed 's/,/./g')" >> ${outfile}
 echo "mg5_version_latest_validated = $(cat __init__.py | awk '/latest_validated_version/{print $3}'  | sed 's/(//' | sed 's/)//' | sed 's/,/./g')" >> ${outfile}
 echo "" >> ${outfile}

diff --git a/.github/workflows/archiver.yml b/.github/workflows/archiver.yml
@@ -13,8 +13,8 @@ on:
   push:
 
     tags:
-    # Include version tags such as 'cudacpp_for3.6.0_v1.0.0' or 'cudacpp_for3.6.0_v1.0.0_test001'
-    # Include version tags such as 'valassi_cudacpp_for3.6.0_v1.0.0' or 'valassi_cudacpp_for3.6.0_v1.0.0_test001'
+    # Include version tags such as 'cudacpp_for3.6.0_v1.00.00' or 'cudacpp_for3.6.0_v1.00.00_test001'
+    # Include version tags such as 'valassi_cudacpp_for3.6.0_v1.00.00' or 'valassi_cudacpp_for3.6.0_v1.00.00_test001'
     - '*cudacpp_for*_v*'
 
     # Exclude running tags such as 'cudacpp_for3.6.0_latest'

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/CHANGELOG.md b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/CHANGELOG.md
@@ -6,6 +6,22 @@ The format is loosely based on [Keep a Changelog](https://keepachangelog.com).
 
 --------------------------------------------------------------------------------
 
+## [Unreleased] - 2024-10-03
+
+### Changed
+
+- Updated cudacpp version to 1.00.01.
+
+### Fixed
+
+- Platform-specific issues
+  - AV ([#1011]) Added workaround for Floating Point Exceptions in vxxxxx in the HIP backend.
+
+- Infrastructure issues
+  - AV ([#1013]) Fixed release scripts to create 'v1.00.01' tags from a '(1,0,1)' python tuple.
+
+--------------------------------------------------------------------------------
+
 ## [1.00.00] - 2024-10-03
 
 ### Added
@@ -35,6 +51,7 @@ The format is loosely based on [Keep a Changelog](https://keepachangelog.com).
 --------------------------------------------------------------------------------
 
 [1.00.00]: https://github.com/madgraph5/madgraph4gpu/releases/tag/cudacpp_for3.6.0_v1.00.00
+[Unreleased]: https://github.com/madgraph5/madgraph4gpu/compare/cudacpp_for3.6.0_v1.00.00...HEAD
 
 [#601]: https://github.com/madgraph5/madgraph4gpu/issues/601
 [#846]: https://github.com/madgraph5/madgraph4gpu/issues/846
@@ -43,3 +60,4 @@ The format is loosely based on [Keep a Changelog](https://keepachangelog.com).
 [#959]: https://github.com/madgraph5/madgraph4gpu/issues/959
 [#993]: https://github.com/madgraph5/madgraph4gpu/issues/993
 [#1011]: https://github.com/madgraph5/madgraph4gpu/issues/1011
+[#1013]: https://github.com/madgraph5/madgraph4gpu/issues/1013
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py
@@ -66,7 +66,11 @@
     __author__ = 'Andrea Valassi'
     __email__ = '[email protected]'
 
-    __version__ = (1,00,00) # NB the release infrastructure expects 1-digit major and 2-digit minor and patch versions (n,nn,nn)
+    # Plugin version (major,minor,patch) where major>=1, 0<=minor<=99 and 0<=patch<=99
+    # The release infrastructure expects 'vN.NN.NN' tags with 1-digit major and 2-digit minor and patch versions
+    # and it takes care of converting the python tuple '(1,0,1)' into a version string 'v1.00.01'
+    # NB! Do not use '(1,00,01)' here: leading zeros in decimal integer literals are not permitted in python (#1013)
+    __version__ = (1,0,1)
 
     minimal_mg5amcnlo_version = (3,6,0)
     maximal_mg5amcnlo_version = (1000,1000,1000)

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h
@@ -451,7 +451,10 @@
       }
       else
       {
-        const fptype emp = pvec0 / ( vmass * pp );
+        //printf( "DEBUG1011 (before emp): pvec0=%f vmass=%f pp=%f vmass*pp=%f\n", pvec0, vmass, pp, vmass * pp );
+        //const fptype emp = pvec0 / ( vmass * pp ); // this may give a FPE #1011 (why?! maybe when vmass=+-epsilon?)
+        const fptype emp = pvec0 / vmass / pp; // workaround for FPE #1011
+        //printf( "DEBUG1011 (after emp): emp=%f\n", emp );
         vc[2] = cxmake( hel0 * pp / vmass, 0. );
         vc[5] = cxmake( hel0 * pvec3 * emp + hel * pt / pp * sqh, 0. );
         if( pt != 0. )

diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
@@ -58,7 +58,7 @@ generate e+ e- > mu+ mu-
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.006308317184448242 [0m
+[1;32mDEBUG: model prefixing  takes 0.006434440612792969 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -150,7 +150,7 @@ INFO: Checking for minimal orders which gives processes.
 INFO: Please specify coupling orders to bypass this step. 
 INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1  
 INFO: Process has 2 diagrams 
-1 processes with 2 diagrams generated in 0.005 s
+1 processes with 2 diagrams generated in 0.004 s
 Total: 1 processes with 2 diagrams
 output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32
 [1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
@@ -180,19 +180,19 @@ INFO: Finding symmetric diagrams for subprocess group epem_mupmum
 [1;32mDEBUG:  iconfig_to_diag = [0m {1: 1, 2: 2} [1;30m[model_handling.py at line 1551][0m [0m
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2} [1;30m[model_handling.py at line 1552][0m [0m
 Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s
-Wrote files for 8 helas calls in 0.070 s
+Wrote files for 8 helas calls in 0.069 s
 [1;32mDEBUG:  self.vector_size = [0m 32 [1;30m[export_v4.py at line 7023][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates FFV2 routines[0m
 ALOHA: aloha creates FFV4 routines[0m
-ALOHA: aloha creates 3 routines in  0.199 s
+ALOHA: aloha creates 3 routines in  0.201 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates FFV2 routines[0m
 ALOHA: aloha creates FFV4 routines[0m
 ALOHA: aloha creates FFV2_4 routines[0m
-ALOHA: aloha creates 7 routines in  0.252 s
+ALOHA: aloha creates 7 routines in  0.255 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV2
@@ -232,9 +232,9 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m2.042s
-user	0m1.792s
-sys	0m0.243s
+real	0m2.097s
+user	0m1.775s
+sys	0m0.272s
 Code generation completed in 2 seconds
 ************************************************************
 *                                                          *

diff --git a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h
@@ -478,7 +478,10 @@ namespace mg5amcCpu
       }
       else
       {
-        const fptype emp = pvec0 / ( vmass * pp );
+        //printf( "DEBUG1011 (before emp): pvec0=%f vmass=%f pp=%f vmass*pp=%f\n", pvec0, vmass, pp, vmass * pp );
+        //const fptype emp = pvec0 / ( vmass * pp ); // this may give a FPE #1011 (why?! maybe when vmass=+-epsilon?)
+        const fptype emp = pvec0 / vmass / pp; // workaround for FPE #1011
+        //printf( "DEBUG1011 (after emp): emp=%f\n", emp );
         vc[2] = cxmake( hel0 * pp / vmass, 0. );
         vc[5] = cxmake( hel0 * pvec3 * emp + hel * pt / pp * sqh, 0. );
         if( pt != 0. )

diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
@@ -58,7 +58,7 @@ generate e+ e- > mu+ mu-
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.006402015686035156 [0m
+[1;32mDEBUG: model prefixing  takes 0.0062215328216552734 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -150,7 +150,7 @@ INFO: Checking for minimal orders which gives processes.
 INFO: Please specify coupling orders to bypass this step. 
 INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1  
 INFO: Process has 2 diagrams 
-1 processes with 2 diagrams generated in 0.004 s
+1 processes with 2 diagrams generated in 0.005 s
 Total: 1 processes with 2 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu
 [1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
@@ -169,13 +169,13 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM
 FileWriter <class 'MG5aMC_PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h
 FileWriter <class 'MG5aMC_PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
-Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s
+Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates FFV2 routines[0m
 ALOHA: aloha creates FFV4 routines[0m
 ALOHA: aloha creates FFV2_4 routines[0m
-ALOHA: aloha creates 4 routines in  0.265 s
+ALOHA: aloha creates 4 routines in  0.267 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV2
@@ -194,7 +194,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
 quit
 
-real	0m0.659s
-user	0m0.600s
-sys	0m0.042s
-Code generation completed in 0 seconds
+real	0m0.781s
+user	0m0.590s
+sys	0m0.053s
+Code generation completed in 1 seconds
diff --git a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h
@@ -478,7 +478,10 @@ namespace mg5amcCpu
       }
       else
       {
-        const fptype emp = pvec0 / ( vmass * pp );
+        //printf( "DEBUG1011 (before emp): pvec0=%f vmass=%f pp=%f vmass*pp=%f\n", pvec0, vmass, pp, vmass * pp );
+        //const fptype emp = pvec0 / ( vmass * pp ); // this may give a FPE #1011 (why?! maybe when vmass=+-epsilon?)
+        const fptype emp = pvec0 / vmass / pp; // workaround for FPE #1011
+        //printf( "DEBUG1011 (after emp): emp=%f\n", emp );
         vc[2] = cxmake( hel0 * pp / vmass, 0. );
         vc[5] = cxmake( hel0 * pvec3 * emp + hel * pt / pp * sqh, 0. );
         if( pt != 0. )

diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
@@ -58,7 +58,7 @@ generate g g > t t~
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.006418943405151367 [0m
+[1;32mDEBUG: model prefixing  takes 0.0059719085693359375 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -181,12 +181,12 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttx
 [1;32mDEBUG:  iconfig_to_diag = [0m {1: 1, 2: 2, 3: 3} [1;30m[model_handling.py at line 1551][0m [0m
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3} [1;30m[model_handling.py at line 1552][0m [0m
 Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s
-Wrote files for 10 helas calls in 0.071 s
+Wrote files for 10 helas calls in 0.072 s
 [1;32mDEBUG:  self.vector_size = [0m 32 [1;30m[export_v4.py at line 7023][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates FFV1 routines[0m
-ALOHA: aloha creates 2 routines in  0.153 s
+ALOHA: aloha creates 2 routines in  0.150 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates FFV1 routines[0m
@@ -226,9 +226,9 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m1.900s
-user	0m1.626s
-sys	0m0.264s
+real	0m1.997s
+user	0m1.613s
+sys	0m0.278s
 Code generation completed in 2 seconds
 ************************************************************
 *                                                          *

diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h
@@ -478,7 +478,10 @@ namespace mg5amcCpu
       }
       else
       {
-        const fptype emp = pvec0 / ( vmass * pp );
+        //printf( "DEBUG1011 (before emp): pvec0=%f vmass=%f pp=%f vmass*pp=%f\n", pvec0, vmass, pp, vmass * pp );
+        //const fptype emp = pvec0 / ( vmass * pp ); // this may give a FPE #1011 (why?! maybe when vmass=+-epsilon?)
+        const fptype emp = pvec0 / vmass / pp; // workaround for FPE #1011
+        //printf( "DEBUG1011 (after emp): emp=%f\n", emp );
         vc[2] = cxmake( hel0 * pp / vmass, 0. );
         vc[5] = cxmake( hel0 * pvec3 * emp + hel * pt / pp * sqh, 0. );
         if( pt != 0. )

diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt
@@ -58,7 +58,7 @@ generate g g > t t~
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.006226539611816406 [0m
+[1;32mDEBUG: model prefixing  takes 0.006254673004150391 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -174,7 +174,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates FFV1 routines[0m
-ALOHA: aloha creates 2 routines in  0.143 s
+ALOHA: aloha creates 2 routines in  0.144 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -189,7 +189,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
 quit
 
-real	0m0.559s
-user	0m0.480s
-sys	0m0.044s
+real	0m0.532s
+user	0m0.478s
+sys	0m0.045s
 Code generation completed in 0 seconds
diff --git a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h
@@ -478,7 +478,10 @@ namespace mg5amcCpu
       }
       else
       {
-        const fptype emp = pvec0 / ( vmass * pp );
+        //printf( "DEBUG1011 (before emp): pvec0=%f vmass=%f pp=%f vmass*pp=%f\n", pvec0, vmass, pp, vmass * pp );
+        //const fptype emp = pvec0 / ( vmass * pp ); // this may give a FPE #1011 (why?! maybe when vmass=+-epsilon?)
+        const fptype emp = pvec0 / vmass / pp; // workaround for FPE #1011
+        //printf( "DEBUG1011 (after emp): emp=%f\n", emp );
         vc[2] = cxmake( hel0 * pp / vmass, 0. );
         vc[5] = cxmake( hel0 * pvec3 * emp + hel * pt / pp * sqh, 0. );
         if( pt != 0. )

diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt
@@ -58,7 +58,7 @@ generate g g > t t~
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.006343364715576172 [0m
+[1;32mDEBUG: model prefixing  takes 0.006289482116699219 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -159,7 +159,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2  
 INFO: Process has 16 diagrams 
-1 processes with 16 diagrams generated in 0.020 s
+1 processes with 16 diagrams generated in 0.019 s
 Total: 2 processes with 19 diagrams
 output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32
 [1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
@@ -201,23 +201,23 @@ INFO: Finding symmetric diagrams for subprocess group gg_ttx
 [1;32mDEBUG:  len(subproc_diagrams_for_config) = [0m 3 [1;30m[model_handling.py at line 1527][0m [0m
 [1;32mDEBUG:  iconfig_to_diag = [0m {1: 1, 2: 2, 3: 3} [1;30m[model_handling.py at line 1551][0m [0m
 [1;32mDEBUG:  diag_to_iconfig = [0m {1: 1, 2: 2, 3: 3} [1;30m[model_handling.py at line 1552][0m [0m
-Generated helas calls for 2 subprocesses (19 diagrams) in 0.045 s
-Wrote files for 46 helas calls in 0.191 s
+Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s
+Wrote files for 46 helas calls in 0.189 s
 [1;32mDEBUG:  self.vector_size = [0m 32 [1;30m[export_v4.py at line 7023][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV3 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV4 set of routines with options: P0[0m
-ALOHA: aloha creates 5 routines in  0.326 s
+ALOHA: aloha creates 5 routines in  0.338 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV3 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV4 set of routines with options: P0[0m
-ALOHA: aloha creates 10 routines in  0.306 s
+ALOHA: aloha creates 10 routines in  0.311 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -265,10 +265,10 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m2.598s
-user	0m2.282s
-sys	0m0.314s
-Code generation completed in 3 seconds
+real	0m2.618s
+user	0m2.304s
+sys	0m0.310s
+Code generation completed in 2 seconds
 ************************************************************
 *                                                          *
 *                      W E L C O M E to                    *

diff --git a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h
@@ -478,7 +478,10 @@ namespace mg5amcCpu
       }
       else
       {
-        const fptype emp = pvec0 / ( vmass * pp );
+        //printf( "DEBUG1011 (before emp): pvec0=%f vmass=%f pp=%f vmass*pp=%f\n", pvec0, vmass, pp, vmass * pp );
+        //const fptype emp = pvec0 / ( vmass * pp ); // this may give a FPE #1011 (why?! maybe when vmass=+-epsilon?)
+        const fptype emp = pvec0 / vmass / pp; // workaround for FPE #1011
+        //printf( "DEBUG1011 (after emp): emp=%f\n", emp );
         vc[2] = cxmake( hel0 * pp / vmass, 0. );
         vc[5] = cxmake( hel0 * pvec3 * emp + hel * pt / pp * sqh, 0. );
         if( pt != 0. )