diff --git a/c/imag_self_energy_with_g.c b/c/imag_self_energy_with_g.c index d6d32e05..d9aac798 100644 --- a/c/imag_self_energy_with_g.c +++ b/c/imag_self_energy_with_g.c @@ -221,7 +221,7 @@ void ise_imag_self_energy_at_triplet( const long triplet[3], const long triplet_weight, const double *g1, const double *g2_3, const long (*g_pos)[4], const long num_g_pos, const double *temperatures, const long num_temps, - const double cutoff_frequency, const long openmp_possible, + const double cutoff_frequency, const long openmp_per_triplets, const long at_a_frequency_point) { long i, j; double *n1, *n2, *ise_at_g_pos; @@ -238,7 +238,7 @@ void ise_imag_self_energy_at_triplet( } #ifdef _OPENMP -#pragma omp parallel for private(j, g_pos_3) if (openmp_possible) +#pragma omp parallel for private(j, g_pos_3) if (!openmp_per_triplets) #endif for (i = 0; i < num_g_pos; i++) { if (at_a_frequency_point) { diff --git a/c/pp_collision.c b/c/pp_collision.c index 6258f083..9cd6b9e8 100644 --- a/c/pp_collision.c +++ b/c/pp_collision.c @@ -96,7 +96,6 @@ void ppc_get_pp_collision( tpl_set_relative_grid_address(tp_relative_grid_address, relative_grid_address, 2); - #ifdef _OPENMP #pragma omp parallel for schedule(guided) private( \ g, g_zero) if (openmp_per_triplets) @@ -234,12 +233,12 @@ static void get_collision( fc3_normal_squared, num_band0, num_band, g_pos, num_g_pos, frequencies, eigenvectors, triplet, bzgrid, fc3, is_compact_fc3, atom_triplets, masses, band_indices, symmetrize_fc3_q, cutoff_frequency, 0, 0, - 1 - openmp_per_triplets); + openmp_per_triplets); ise_imag_self_energy_at_triplet( ise, num_band0, num_band, fc3_normal_squared, frequencies, triplet, triplet_weight, g, g + num_band_prod, g_pos, num_g_pos, temperatures, - num_temps, cutoff_frequency, 1 - openmp_per_triplets, 0); + num_temps, cutoff_frequency, openmp_per_triplets, 0); free(fc3_normal_squared); fc3_normal_squared = NULL; diff --git a/c/real_to_reciprocal.c b/c/real_to_reciprocal.c index 00f88068..29f9212b 100644 --- a/c/real_to_reciprocal.c +++ b/c/real_to_reciprocal.c @@ -149,15 +149,15 @@ static void real_to_reciprocal_legacy( const long openmp_per_triplets) { long i, j, k, l, m, n, ijk; long num_patom, num_satom, num_band; - lapack_complex_double fc3_rec_elem[27], fc3_rec; + lapack_complex_double fc3_rec_elem[27]; num_patom = atom_triplets->multi_dims[1]; num_satom = atom_triplets->multi_dims[0]; num_band = num_patom * 3; #ifdef _OPENMP -#pragma omp parallel for private(i, j, k, l, m, n, fc3_rec_elem, \ - fc3_rec) if (!openmp_per_triplets) +#pragma omp parallel for private(i, j, k, l, m, n, \ + fc3_rec_elem) if (!openmp_per_triplets) #endif for (ijk = 0; ijk < num_patom * num_patom * num_patom; ijk++) { i = ijk / (num_patom * num_patom); @@ -170,14 +170,10 @@ static void real_to_reciprocal_legacy( for (l = 0; l < 3; l++) { for (m = 0; m < 3; m++) { for (n = 0; n < 3; n++) { - fc3_rec = phonoc_complex_prod( - fc3_rec_elem[l * 9 + m * 3 + n], pre_phase_factors[i]); fc3_reciprocal[(i * 3 + l) * num_band * num_band + (j * 3 + m) * num_band + k * 3 + n] = - sum_lapack_complex_double( - fc3_reciprocal[(i * 3 + l) * num_band * num_band + - (j * 3 + m) * num_band + k * 3 + n], - fc3_rec); + phonoc_complex_prod(fc3_rec_elem[l * 9 + m * 3 + n], + pre_phase_factors[i]); } } } diff --git a/c/reciprocal_to_normal.c b/c/reciprocal_to_normal.c index bb2b360f..ca7a277c 100644 --- a/c/reciprocal_to_normal.c +++ b/c/reciprocal_to_normal.c @@ -59,6 +59,11 @@ static double get_fc3_sum_blas(const lapack_complex_double *e0, const lapack_complex_double *e2, const lapack_complex_double *fc3_reciprocal, const long num_band); +static double get_fc3_sum_blas_like(const lapack_complex_double *e0, + const lapack_complex_double *e1, + const lapack_complex_double *e2, + const lapack_complex_double *fc3_reciprocal, + const long num_band); void reciprocal_to_normal_squared( double *fc3_normal_squared, const long (*g_pos)[4], const long num_g_pos, const lapack_complex_double *fc3_reciprocal, const double *freqs0, @@ -68,7 +73,7 @@ void reciprocal_to_normal_squared( const lapack_complex_double *eigvecs2, const double *masses, const long *band_indices, const long num_band, const double cutoff_frequency, const long openmp_per_triplets) { - long i, j, num_atom; + long i, j, ij, num_atom, use_multithreaded_blas; double *inv_sqrt_masses; lapack_complex_double *e0, *e1, *e2; @@ -95,26 +100,26 @@ void reciprocal_to_normal_squared( e2 = e1 + num_band * num_band; #ifdef _OPENMP -#pragma omp parallel for private(j) if (!openmp_per_triplets) +#pragma omp parallel for private(i, j) if (!openmp_per_triplets) #endif - for (i = 0; i < num_band; i++) { - for (j = 0; j < num_band; j++) { - e0[i * num_band + j] = lapack_make_complex_double( - lapack_complex_double_real(eigvecs0[j * num_band + i]) * - inv_sqrt_masses[j], - lapack_complex_double_imag(eigvecs0[j * num_band + i]) * - inv_sqrt_masses[j]); - e1[i * num_band + j] = lapack_make_complex_double( - lapack_complex_double_real(eigvecs1[j * num_band + i]) * - inv_sqrt_masses[j], - lapack_complex_double_imag(eigvecs1[j * num_band + i]) * - inv_sqrt_masses[j]); - e2[i * num_band + j] = lapack_make_complex_double( - lapack_complex_double_real(eigvecs2[j * num_band + i]) * - inv_sqrt_masses[j], - lapack_complex_double_imag(eigvecs2[j * num_band + i]) * - inv_sqrt_masses[j]); - } + for (ij = 0; ij < num_band * num_band; ij++) { + i = ij / num_band; + j = ij % num_band; + e0[i * num_band + j] = lapack_make_complex_double( + lapack_complex_double_real(eigvecs0[j * num_band + i]) * + inv_sqrt_masses[j], + lapack_complex_double_imag(eigvecs0[j * num_band + i]) * + inv_sqrt_masses[j]); + e1[i * num_band + j] = lapack_make_complex_double( + lapack_complex_double_real(eigvecs1[j * num_band + i]) * + inv_sqrt_masses[j], + lapack_complex_double_imag(eigvecs1[j * num_band + i]) * + inv_sqrt_masses[j]); + e2[i * num_band + j] = lapack_make_complex_double( + lapack_complex_double_real(eigvecs2[j * num_band + i]) * + inv_sqrt_masses[j], + lapack_complex_double_imag(eigvecs2[j * num_band + i]) * + inv_sqrt_masses[j]); } free(inv_sqrt_masses); @@ -131,20 +136,39 @@ void reciprocal_to_normal_squared( loopStartCPUTime = clock(); #endif +#ifdef MULTITHREADED_BLAS + if (openmp_per_triplets) { + use_multithreaded_blas = 0; + } else { + use_multithreaded_blas = 1; + } +#else + use_multithreaded_blas = 0; #ifdef _OPENMP #pragma omp parallel for if (!openmp_per_triplets) +#endif #endif for (i = 0; i < num_g_pos; i++) { if (freqs0[band_indices[g_pos[i][0]]] > cutoff_frequency && freqs1[g_pos[i][1]] > cutoff_frequency && freqs2[g_pos[i][2]] > cutoff_frequency) { - fc3_normal_squared[g_pos[i][3]] = - get_fc3_sum(e0 + band_indices[g_pos[i][0]] * num_band, - e1 + g_pos[i][1] * num_band, - e2 + g_pos[i][2] * num_band, fc3_reciprocal, - num_band) / - (freqs0[band_indices[g_pos[i][0]]] * freqs1[g_pos[i][1]] * - freqs2[g_pos[i][2]]); + if (use_multithreaded_blas) { + fc3_normal_squared[g_pos[i][3]] = + get_fc3_sum_blas(e0 + band_indices[g_pos[i][0]] * num_band, + e1 + g_pos[i][1] * num_band, + e2 + g_pos[i][2] * num_band, + fc3_reciprocal, num_band) / + (freqs0[band_indices[g_pos[i][0]]] * freqs1[g_pos[i][1]] * + freqs2[g_pos[i][2]]); + } else { + fc3_normal_squared[g_pos[i][3]] = + get_fc3_sum_blas_like( + e0 + band_indices[g_pos[i][0]] * num_band, + e1 + g_pos[i][1] * num_band, + e2 + g_pos[i][2] * num_band, fc3_reciprocal, num_band) / + (freqs0[band_indices[g_pos[i][0]]] * freqs1[g_pos[i][1]] * + freqs2[g_pos[i][2]]); + } } else { fc3_normal_squared[g_pos[i][3]] = 0; } @@ -234,3 +258,46 @@ static double get_fc3_sum_blas(const lapack_complex_double *e0, lapack_complex_double_imag(retval) * lapack_complex_double_imag(retval); } + +static double get_fc3_sum_blas_like(const lapack_complex_double *e0, + const lapack_complex_double *e1, + const lapack_complex_double *e2, + const lapack_complex_double *fc3_reciprocal, + const long num_band) { + long i, j; + double sum_real, sum_imag, retval_real, retval_imag; + lapack_complex_double *e_12, fc3_e_12, fc3_e_012; + + e_12 = (lapack_complex_double *)malloc(sizeof(lapack_complex_double) * + num_band * num_band); + + for (i = 0; i < num_band; i++) { + memcpy(e_12 + i * num_band, e2, 16 * num_band); + for (j = 0; j < num_band; j++) { + e_12[i * num_band + j] = + phonoc_complex_prod(e1[i], e_12[i * num_band + j]); + } + } + + retval_real = 0; + retval_imag = 0; + for (i = 0; i < num_band; i++) { + sum_real = 0; + sum_imag = 0; + for (j = 0; j < num_band * num_band; j++) { + fc3_e_12 = phonoc_complex_prod( + fc3_reciprocal[i * num_band * num_band + j], e_12[j]); + sum_real += lapack_complex_double_real(fc3_e_12); + sum_imag += lapack_complex_double_imag(fc3_e_12); + } + fc3_e_012 = phonoc_complex_prod( + e0[i], lapack_make_complex_double(sum_real, sum_imag)); + retval_real += lapack_complex_double_real(fc3_e_012); + retval_imag += lapack_complex_double_imag(fc3_e_012); + } + + free(e_12); + e_12 = NULL; + + return retval_real * retval_real + retval_imag * retval_imag; +}