performance tweaks of the batch correlate and inner product functions. (gwastro#1124)

* slightly increase performance

* add mfilter part

* minor style improvements

* fix start

* update unittest
ahnitz authored and cmbiwer committed Oct 5, 2016
1 parent 68d2fc2 commit 453ab80
Showing 7 changed files with 92 additions and 54 deletions.
16 changes: 9 additions & 7 deletions pycbc/filter/matchedfilter.py
@@ -1420,6 +1420,8 @@ def _process_batch(self):
self.corr[self.block_id].execute(stilde)
self.ifts[mid].execute()

self.block_id += 1

snr = numpy.zeros(len(tgroup), dtype=numpy.complex64)
time = numpy.zeros(len(tgroup), dtype=numpy.float64)
templates = numpy.zeros(len(tgroup), dtype=numpy.uint64)
@@ -1437,27 +1439,28 @@ def _process_batch(self):
# Find the peaks in our SNR times series from the various templates
i = 0
for htilde in tgroup:
m, l = htilde.out[seg].abs_max_loc()
l = htilde.out[seg].abs_arg_max()

sgm = htilde.sigmasq(psd)
norm = 4.0 * htilde.delta_f / (sgm ** 0.5)

l += valid_start
snrv = numpy.array([htilde.out[l]])

# If nothing is above threshold we can exit this template
s = m * norm
s = abs(snrv[0]) * norm
if s < self.snr_threshold:
continue

time[i] += float(l) / self.data.sample_rate
l += valid_start
time[i] += float(l - valid_start) / self.data.sample_rate

# We have an SNR so high that we will drop the entire analysis
# of this chunk of time!
if self.snr_abort_threshold is not None and s > self.snr_abort_threshold:
logging.info("We are seeing some *really* high SNRs, lets"
"assume they aren't signals and just give up")
" assume they aren't signals and just give up")
return False, []

snrv = numpy.array([htilde.out[l]])
veto_info.append((snrv, norm, l, htilde, stilde))

snr[i] = snrv[0] * norm
@@ -1481,7 +1484,6 @@ def _process_batch(self):
for key in tkeys:
result[key] = numpy.array(result[key])

self.block_id += 1
return result, veto_info

__all__ = ['match', 'matched_filter', 'sigmasq', 'sigma', 'get_cutoff_indices',
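For orientation, the reordered loop above now locates the peak with the new abs_arg_max(), normalizes that single sample, and only then applies the SNR threshold, so quiet templates exit early. A standalone NumPy sketch of that flow (illustrative names and toy numbers, not the actual MatchedFilterControl attributes; norm is assumed to be precomputed from sigmasq as in the diff):

import numpy

def loudest_peak(snr_series, valid_start, valid_end, norm,
                 sample_rate, snr_threshold):
    """Return (|SNR|, complex SNR, peak time in s) or None if below threshold."""
    segment = snr_series[valid_start:valid_end]
    # abs_arg_max equivalent: index of the largest complex magnitude.
    l = int(numpy.argmax(segment.real ** 2 + segment.imag ** 2))
    snrv = segment[l]
    s = abs(snrv) * norm
    if s < snr_threshold:           # nothing above threshold for this template
        return None
    return s, snrv * norm, float(l) / sample_rate

# toy usage with fabricated data
snr = numpy.zeros(4096, dtype=numpy.complex64)
snr[1234] = 80 + 15j
print(loudest_peak(snr, 100, 4000, norm=0.1, sample_rate=4096.0, snr_threshold=5.5))
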
1 change: 1 addition & 0 deletions pycbc/filter/matchedfilter_cpu.py
@@ -29,6 +29,7 @@
from .matchedfilter import _BaseCorrelator

batch_correlator_code = """
#pragma omp parallel for
for (int i=0; i<num_vectors; i++){
std::complex<float>* xp = (std::complex<float>*) x[i];
std::complex<float>* zp = (std::complex<float>*) z[i];
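The only functional change in this file is the #pragma omp parallel for, which spreads the per-template correlations of a batch across OpenMP threads. Loosely, each iteration is an elementwise frequency-domain correlation of one template against the data segment; a rough single-threaded NumPy equivalent (illustrative only, and assuming the kernel computes conj(template) * data like the scalar correlate kernel):

import numpy

def batch_correlate(templates, stilde, outputs):
    """Correlate one frequency-domain data segment against many templates."""
    # The C kernel runs this outer loop across OpenMP threads,
    # one template per iteration, writing into preallocated outputs.
    for htilde, out in zip(templates, outputs):
        numpy.multiply(numpy.conj(htilde), stilde, out=out)

# toy usage
stilde = numpy.array([1 + 1j, 2 - 1j, 0.5j], dtype=numpy.complex64)
templates = [numpy.array([1j, 1, 1 - 1j], dtype=numpy.complex64) for _ in range(4)]
outputs = [numpy.empty(3, dtype=numpy.complex64) for _ in range(4)]
batch_correlate(templates, stilde, outputs)
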
11 changes: 9 additions & 2 deletions pycbc/types/array.py
@@ -46,7 +46,7 @@
# we should restrict any functions that do not allow an
# array of uint32 integers
_ALLOWED_DTYPES = [_numpy.float32, _numpy.float64, _numpy.complex64,
_numpy.complex128, _numpy.uint32]
_numpy.complex128, _numpy.uint32, _numpy.int32, _numpy.int]
_ALLOWED_SCALARS = [int, long, float, complex] + _ALLOWED_DTYPES

def _convert_to_scheme(ary):
@@ -674,6 +674,11 @@ def max(self):
def max_loc(self):
"""Return the maximum value in the array along with the index location """

@_convert
@schemed(BACKEND_PREFIX)
def abs_arg_max(self):
""" Return location of the maximum argument max """

@_convert
@schemed(BACKEND_PREFIX)
def abs_max_loc(self):
@@ -808,8 +813,10 @@ def precision(self):
def kind(self):
if self.dtype == float32 or self.dtype == float64:
return 'real'
elif self.dtype == complex64 or self.dtype == complex128:
return 'complex'
else:
return 'complex'
return 'unknown'

@property
@_convert
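Two API-level points in this file: integer dtypes (int32 and the native int) join the allowed list, and kind now reports 'unknown' for them instead of falling through to 'complex'. A small sketch of the updated kind branching for plain NumPy arrays (not the pycbc property itself):

import numpy

def kind_of(a):
    """Mirror the updated Array.kind branching for a NumPy array (sketch only)."""
    if a.dtype in (numpy.float32, numpy.float64):
        return 'real'
    elif a.dtype in (numpy.complex64, numpy.complex128):
        return 'complex'
    else:
        return 'unknown'     # e.g. the newly allowed uint32/int32 arrays

assert kind_of(numpy.zeros(3, dtype=numpy.float64)) == 'real'
assert kind_of(numpy.zeros(3, dtype=numpy.complex64)) == 'complex'
assert kind_of(numpy.zeros(3, dtype=numpy.int32)) == 'unknown'

The abs_arg_max entry here is only a scheme-dispatched declaration; its CPU implementation is in pycbc/types/array_cpu.py below.
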
70 changes: 65 additions & 5 deletions pycbc/types/array_cpu.py
@@ -27,6 +27,10 @@
from array import common_kind, complex128, float64
import aligned as _algn
from scipy.linalg import blas
from scipy.weave import inline
from pycbc.opt import omp_libs, omp_flags
from pycbc import WEAVE_FLAGS
from pycbc.types import real_same_precision_as

def zeros(length, dtype=_np.float64):
return _algn.zeros(length, dtype=dtype)
@@ -41,12 +45,45 @@ def dot(self, other):
return _np.dot(self._data,other)

def min(self):
return self.data.min()
return self.data.min()

code_abs_arg_max = """
float val = 0;
int l = 0;
for (int i=0; i<N; i++){
float mag = data[i*2] * data[i*2] + data[i*2+1] * data[i*2+1];
if ( mag > val){
l = i;
val = mag;
}
}
loc[0] = l;
"""
code_flags = [WEAVE_FLAGS + '-march=native -O3 -w'] + omp_flags


def abs_arg_max(self):
if self.kind == 'real':
return _np.argmax(self.data)
else:
data = _np.array(self._data,
copy=False).view(real_same_precision_as(self))
loc = _np.array([0])
N = len(self)
inline(code_abs_arg_max, ['data', 'loc', 'N'], libraries=omp_libs,
extra_compile_args=code_flags)
return loc[0]

def abs_max_loc(self):
tmp = abs(self.data)
ind = _np.argmax(tmp)
return tmp[ind], ind
if self.kind == 'real':
tmp = abs(self.data)
ind = _np.argmax(tmp)
return tmp[ind], ind
else:
tmp = self.data.real ** 2.0
tmp += self.data.imag ** 2.0
ind = _np.argmax(tmp)
return tmp[ind] ** 0.5, ind

def cumsum(self):
return self.data.cumsum()
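
The weave kernel above scans |z|^2 = re^2 + im^2 rather than |z|, so the per-sample square root disappears and only the final abs_max_loc result takes one sqrt. For readers without scipy.weave, a plain NumPy stand-in for the complex branches (not the compiled kernel):

import numpy

def abs_arg_max_numpy(data):
    """Index of the sample with the largest complex magnitude."""
    mag2 = data.real ** 2 + data.imag ** 2    # same ordering as |data|, no sqrt
    return int(numpy.argmax(mag2))

def abs_max_loc_numpy(data):
    """(max |data|, index), with the single sqrt taken only at the end."""
    mag2 = data.real ** 2 + data.imag ** 2
    ind = int(numpy.argmax(mag2))
    return mag2[ind] ** 0.5, ind

z = numpy.array([1 + 1j, 3 - 4j, 0.5j], dtype=numpy.complex64)
assert abs_arg_max_numpy(z) == 1
assert abs_max_loc_numpy(z) == (5.0, 1)
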
@@ -74,14 +111,37 @@ def weighted_inner(self, other, weight):
acum_dtype = float64

return _np.sum(self.data.conj() * other / weight, dtype=acum_dtype)



inner_code = """
double value = 0;
#pragma omp parallel for reduction(+:value)
for (int i=0; i<N; i++){
float val = x[i] * y[i];
value += val;
}
total[0] = value;
"""


def inner_inline_real(self, other):
x = _np.array(self._data, copy=False)
y = _np.array(other, copy=False)
total = _np.array([0])
N = len(self)
inline(inner_code, ['x', 'y', 'total', 'N'], libraries=omp_libs,
extra_compile_args=code_flags)
return total[0]

def inner(self, other):
""" Return the inner product of the array with complex conjugation.
"""
cdtype = common_kind(self.dtype, other.dtype)
if cdtype.kind == 'c':
acum_dtype = complex128
else:
return inner_inline_real(self, other)
acum_dtype = float64
return _np.sum(self.data.conj() * other, dtype=acum_dtype)
#return _np.vdot(self.data, other)
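The new real-valued inner product is an OpenMP reduction compiled with scipy.weave: each thread accumulates a partial sum of x[i]*y[i], and the reduction(+:value) clause combines the partial sums in a double. Conceptually it is an ordinary dot product; a single-threaded NumPy sketch of the same contract (the real code inlines C, and the complex path still goes through numpy):

import numpy

def inner_inline_real_sketch(x, y):
    """Real inner product accumulated in float64, like the OpenMP reduction."""
    # Note: the weave version forms each product in single precision and keeps
    # only the running sum in double; this sketch promotes everything to float64.
    x = numpy.asarray(x, dtype=numpy.float64)
    y = numpy.asarray(y, dtype=numpy.float64)
    return float(numpy.dot(x, y))

x = numpy.array([1.0, 2.0, 3.0], dtype=numpy.float32)
y = numpy.array([4.0, 5.0, 6.0], dtype=numpy.float32)
assert inner_inline_real_sketch(x, y) == 32.0
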
16 changes: 2 additions & 14 deletions test/test_array.py
@@ -200,22 +200,13 @@ def test_numpy_init(self):
self.assertEqual(out5[0],1)
self.assertEqual(out5[1],2)
self.assertEqual(out5[2],3)
self.assertTrue(out5.dtype==numpy.float64)

# We shouldn't be able to copy it though
self.assertRaises(TypeError,Array,in5, copy=False)

# Just checking that we can make an empty array correctly
empty = numpy.array([])
out6 = Array(empty)
self.assertTrue(out6.dtype==numpy.float64)
self.assertRaises(IndexError, out6.__getitem__,0)

if self.scheme != 'cpu':
self.assertRaises(TypeError, Array, in4, copy=False)
self.assertRaises(TypeError, Array, in5, copy=False)


def test_array_init(self):
# this array is made outside the context so we can check that an error is raised when copy = false in a GPU scheme
cpuarray = Array([1,2,3])
@@ -290,8 +281,6 @@ def test_array_init(self):
self.assertEqual(out4[2],3)
self.assertTrue(out4.dtype==self.dtype)

self.assertRaises(TypeError, Array,in1, dtype=numpy.int32)

# Check for copy=false
in3 = Array([5,3,1],dtype=self.dtype)
out5 = Array(in3,copy=False)
@@ -411,10 +400,9 @@ def test_list_init(self):

else:
self.assertRaises(TypeError, Array,[5+0j, 3+0j, 1+0j],dtype=self.dtype)
self.assertRaises(TypeError, Array,[1,2,3], dtype=numpy.int32)


#Also, when it is unspecified
out3 = Array([5,3,1])
out3 = Array([5.0,3,1])

self.assertTrue(type(out3._scheme) == type(self.context))
self.assertTrue(type(out3._data) is SchemeArray)
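The deletions in these test files follow from the dtype change in pycbc/types/array.py: constructions with integer dtypes used to be expected to raise TypeError and now are expected to succeed, and the list-init cases switch to float literals so the default-dtype checks still exercise float64. Roughly, under the new behavior (assuming a working pycbc install; the exact promotion rules belong to the Array constructor):

import numpy
from pycbc.types import Array

a = Array([1, 2, 3], dtype=numpy.int32)   # previously expected to raise TypeError
print(a.dtype, a.kind)                    # int32 and, per the new logic, 'unknown'

b = Array([5.0, 3, 1])                    # float literal keeps the float64 default
print(b.dtype)                            # float64
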
16 changes: 2 additions & 14 deletions test/test_frequencyseries.py
@@ -200,24 +200,15 @@ def test_numpy_init(self):
self.assertEqual(out5[0],1)
self.assertEqual(out5[1],2)
self.assertEqual(out5[2],3)
self.assertTrue(out5.dtype==numpy.float64)
self.assertEqual(out5.delta_f,0.1)
self.assertEqual(out5._epoch, self.epoch)

# We shouldn't be able to copy it though
self.assertRaises(TypeError,FrequencySeries,in5, 0.1, copy=False)

# Finally, just checking a few things specific to FrequencySeries
inbad = numpy.array([],dtype=float64)
self.assertRaises(ValueError, FrequencySeries, in1, -1)
self.assertRaises(ValueError, FrequencySeries, inbad, .1)
self.assertRaises(TypeError, FrequencySeries, in1, .1, epoch=(5,1))

if self.scheme != 'cpu':
self.assertRaises(TypeError, FrequencySeries, in4, 0.1, copy=False)
self.assertRaises(TypeError, FrequencySeries, in5,0.1, copy=False)


def test_array_init(self):
# this array is made outside the context so we can check that an error is raised when copy = false in a GPU scheme
cpuarray = Array([1,2,3])
@@ -300,8 +291,6 @@ def test_array_init(self):
self.assertEqual(out4.delta_f, 0.1)
self.assertEqual(out4._epoch, self.epoch)

self.assertRaises(TypeError, FrequencySeries,in1,0.1, dtype=numpy.int32)

# Check for copy=false
in3 = Array([5,3,1],dtype=self.dtype)
out5 = FrequencySeries(in3,0.1,copy=False, epoch=self.epoch)
@@ -378,10 +367,9 @@ def test_list_init(self):

else:
self.assertRaises(TypeError, FrequencySeries,[5+0j, 3+0j, 1+0j], 0.1, dtype=self.dtype)
self.assertRaises(TypeError, FrequencySeries,[1,2,3],0.1, dtype=numpy.int32)

#Also, when it is unspecified
out3 = FrequencySeries([5,3,1],0.1,epoch=self.epoch)
out3 = FrequencySeries([5.0,3,1],0.1,epoch=self.epoch)

self.assertTrue(type(out3._scheme) == type(self.context))
self.assertTrue(type(out3._data) is SchemeArray)
@@ -392,7 +380,7 @@ def test_list_init(self):
self.assertEqual(out3.delta_f, 0.1)
self.assertEqual(out3._epoch, self.epoch)

out4 = FrequencySeries([5+0j,3+0j,1+0j],0.1,epoch = self.epoch)
out4 = FrequencySeries([5.0+0j,3+0j,1+0j],0.1,epoch = self.epoch)

self.assertTrue(type(out4._scheme) == type(self.context))
self.assertTrue(type(out4._data) is SchemeArray)
16 changes: 4 additions & 12 deletions test/test_timeseries.py
@@ -199,24 +199,19 @@ def test_numpy_init(self):
self.assertEqual(out5[0],1)
self.assertEqual(out5[1],2)
self.assertEqual(out5[2],3)
self.assertTrue(out5.dtype==numpy.float64)
#self.assertTrue(out5.dtype==numpy.float64)
self.assertEqual(out5.delta_t,0.1)
self.assertEqual(out5.start_time, self.epoch)

# We shouldn't be able to copy it though
self.assertRaises(TypeError,TimeSeries,in5, 0.1, copy=False)
#self.assertRaises(TypeError,TimeSeries,in5, 0.1, copy=False)

# Finally, just checking a few things specific to timeseries
inbad = numpy.array([],dtype=float64)
self.assertRaises(ValueError, TimeSeries, in1, -1)
self.assertRaises(ValueError, TimeSeries, inbad, .1)
self.assertRaises(TypeError, TimeSeries, in1, .1, epoch=(5,1))

if self.scheme != 'cpu':
self.assertRaises(TypeError, TimeSeries, in4, 0.1, copy=False)
self.assertRaises(TypeError, TimeSeries, in5,0.1, copy=False)


def test_array_init(self):
# this array is made outside the context so we can check that an error is raised when copy = false in a GPU scheme
cpuarray = Array([1,2,3])
@@ -299,8 +294,6 @@ def test_array_init(self):
self.assertEqual(out4.delta_t, 0.1)
self.assertEqual(out4.start_time, self.epoch)

self.assertRaises(TypeError, TimeSeries,in1,0.1, dtype=numpy.int32)

# Check for copy=false
in3 = Array([5,3,1],dtype=self.dtype)
out5 = TimeSeries(in3,0.1,copy=False,epoch=self.epoch)
@@ -364,7 +357,7 @@ def test_list_init(self):
#self.assertTrue(out1.kind == 'complex')

if self.kind == 'complex':
out2 = TimeSeries([5+0j,3+0j,1+0j], 0.1, dtype=self.dtype, epoch=self.epoch)
out2 = TimeSeries([5.0+0j,3+0j,1+0j], 0.1, dtype=self.dtype, epoch=self.epoch)

self.assertTrue(type(out2._scheme) == type(self.context))
self.assertTrue(type(out2._data) is SchemeArray)
@@ -377,10 +370,9 @@ def test_list_init(self):

else:
self.assertRaises(TypeError, TimeSeries,[5+0j, 3+0j, 1+0j], 0.1, dtype=self.dtype)
self.assertRaises(TypeError, TimeSeries,[1,2,3],0.1, dtype=numpy.int32)

#Also, when it is unspecified
out3 = TimeSeries([5,3,1],0.1,epoch=self.epoch)
out3 = TimeSeries([5.0,3,1],0.1,epoch=self.epoch)

self.assertTrue(type(out3._scheme) == type(self.context))
self.assertTrue(type(out3._data) is SchemeArray)
