Skip to content

Commit

Permalink
Merge pull request #218 from phonopy/tuning
Browse files Browse the repository at this point in the history
Optimize get_fc3_sum -> get_fc3_sum_blas_like
  • Loading branch information
atztogo authored Apr 18, 2024
2 parents 472d564 + 400fe29 commit 427e004
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 41 deletions.
4 changes: 2 additions & 2 deletions c/imag_self_energy_with_g.c
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ void ise_imag_self_energy_at_triplet(
const long triplet[3], const long triplet_weight, const double *g1,
const double *g2_3, const long (*g_pos)[4], const long num_g_pos,
const double *temperatures, const long num_temps,
const double cutoff_frequency, const long openmp_possible,
const double cutoff_frequency, const long openmp_per_triplets,
const long at_a_frequency_point) {
long i, j;
double *n1, *n2, *ise_at_g_pos;
Expand All @@ -238,7 +238,7 @@ void ise_imag_self_energy_at_triplet(
}

#ifdef _OPENMP
#pragma omp parallel for private(j, g_pos_3) if (openmp_possible)
#pragma omp parallel for private(j, g_pos_3) if (!openmp_per_triplets)
#endif
for (i = 0; i < num_g_pos; i++) {
if (at_a_frequency_point) {
Expand Down
5 changes: 2 additions & 3 deletions c/pp_collision.c
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ void ppc_get_pp_collision(

tpl_set_relative_grid_address(tp_relative_grid_address,
relative_grid_address, 2);

#ifdef _OPENMP
#pragma omp parallel for schedule(guided) private( \
g, g_zero) if (openmp_per_triplets)
Expand Down Expand Up @@ -234,12 +233,12 @@ static void get_collision(
fc3_normal_squared, num_band0, num_band, g_pos, num_g_pos, frequencies,
eigenvectors, triplet, bzgrid, fc3, is_compact_fc3, atom_triplets,
masses, band_indices, symmetrize_fc3_q, cutoff_frequency, 0, 0,
1 - openmp_per_triplets);
openmp_per_triplets);

ise_imag_self_energy_at_triplet(
ise, num_band0, num_band, fc3_normal_squared, frequencies, triplet,
triplet_weight, g, g + num_band_prod, g_pos, num_g_pos, temperatures,
num_temps, cutoff_frequency, 1 - openmp_per_triplets, 0);
num_temps, cutoff_frequency, openmp_per_triplets, 0);

free(fc3_normal_squared);
fc3_normal_squared = NULL;
Expand Down
14 changes: 5 additions & 9 deletions c/real_to_reciprocal.c
Original file line number Diff line number Diff line change
Expand Up @@ -149,15 +149,15 @@ static void real_to_reciprocal_legacy(
const long openmp_per_triplets) {
long i, j, k, l, m, n, ijk;
long num_patom, num_satom, num_band;
lapack_complex_double fc3_rec_elem[27], fc3_rec;
lapack_complex_double fc3_rec_elem[27];

num_patom = atom_triplets->multi_dims[1];
num_satom = atom_triplets->multi_dims[0];
num_band = num_patom * 3;

#ifdef _OPENMP
#pragma omp parallel for private(i, j, k, l, m, n, fc3_rec_elem, \
fc3_rec) if (!openmp_per_triplets)
#pragma omp parallel for private(i, j, k, l, m, n, \
fc3_rec_elem) if (!openmp_per_triplets)
#endif
for (ijk = 0; ijk < num_patom * num_patom * num_patom; ijk++) {
i = ijk / (num_patom * num_patom);
Expand All @@ -170,14 +170,10 @@ static void real_to_reciprocal_legacy(
for (l = 0; l < 3; l++) {
for (m = 0; m < 3; m++) {
for (n = 0; n < 3; n++) {
fc3_rec = phonoc_complex_prod(
fc3_rec_elem[l * 9 + m * 3 + n], pre_phase_factors[i]);
fc3_reciprocal[(i * 3 + l) * num_band * num_band +
(j * 3 + m) * num_band + k * 3 + n] =
sum_lapack_complex_double(
fc3_reciprocal[(i * 3 + l) * num_band * num_band +
(j * 3 + m) * num_band + k * 3 + n],
fc3_rec);
phonoc_complex_prod(fc3_rec_elem[l * 9 + m * 3 + n],
pre_phase_factors[i]);
}
}
}
Expand Down
121 changes: 94 additions & 27 deletions c/reciprocal_to_normal.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ static double get_fc3_sum_blas(const lapack_complex_double *e0,
const lapack_complex_double *e2,
const lapack_complex_double *fc3_reciprocal,
const long num_band);
static double get_fc3_sum_blas_like(const lapack_complex_double *e0,
const lapack_complex_double *e1,
const lapack_complex_double *e2,
const lapack_complex_double *fc3_reciprocal,
const long num_band);
void reciprocal_to_normal_squared(
double *fc3_normal_squared, const long (*g_pos)[4], const long num_g_pos,
const lapack_complex_double *fc3_reciprocal, const double *freqs0,
Expand All @@ -68,7 +73,7 @@ void reciprocal_to_normal_squared(
const lapack_complex_double *eigvecs2, const double *masses,
const long *band_indices, const long num_band,
const double cutoff_frequency, const long openmp_per_triplets) {
long i, j, num_atom;
long i, j, ij, num_atom, use_multithreaded_blas;
double *inv_sqrt_masses;
lapack_complex_double *e0, *e1, *e2;

Expand All @@ -95,26 +100,26 @@ void reciprocal_to_normal_squared(
e2 = e1 + num_band * num_band;

#ifdef _OPENMP
#pragma omp parallel for private(j) if (!openmp_per_triplets)
#pragma omp parallel for private(i, j) if (!openmp_per_triplets)
#endif
for (i = 0; i < num_band; i++) {
for (j = 0; j < num_band; j++) {
e0[i * num_band + j] = lapack_make_complex_double(
lapack_complex_double_real(eigvecs0[j * num_band + i]) *
inv_sqrt_masses[j],
lapack_complex_double_imag(eigvecs0[j * num_band + i]) *
inv_sqrt_masses[j]);
e1[i * num_band + j] = lapack_make_complex_double(
lapack_complex_double_real(eigvecs1[j * num_band + i]) *
inv_sqrt_masses[j],
lapack_complex_double_imag(eigvecs1[j * num_band + i]) *
inv_sqrt_masses[j]);
e2[i * num_band + j] = lapack_make_complex_double(
lapack_complex_double_real(eigvecs2[j * num_band + i]) *
inv_sqrt_masses[j],
lapack_complex_double_imag(eigvecs2[j * num_band + i]) *
inv_sqrt_masses[j]);
}
for (ij = 0; ij < num_band * num_band; ij++) {
i = ij / num_band;
j = ij % num_band;
e0[i * num_band + j] = lapack_make_complex_double(
lapack_complex_double_real(eigvecs0[j * num_band + i]) *
inv_sqrt_masses[j],
lapack_complex_double_imag(eigvecs0[j * num_band + i]) *
inv_sqrt_masses[j]);
e1[i * num_band + j] = lapack_make_complex_double(
lapack_complex_double_real(eigvecs1[j * num_band + i]) *
inv_sqrt_masses[j],
lapack_complex_double_imag(eigvecs1[j * num_band + i]) *
inv_sqrt_masses[j]);
e2[i * num_band + j] = lapack_make_complex_double(
lapack_complex_double_real(eigvecs2[j * num_band + i]) *
inv_sqrt_masses[j],
lapack_complex_double_imag(eigvecs2[j * num_band + i]) *
inv_sqrt_masses[j]);
}

free(inv_sqrt_masses);
Expand All @@ -131,20 +136,39 @@ void reciprocal_to_normal_squared(
loopStartCPUTime = clock();
#endif

#ifdef MULTITHREADED_BLAS
if (openmp_per_triplets) {
use_multithreaded_blas = 0;
} else {
use_multithreaded_blas = 1;
}
#else
use_multithreaded_blas = 0;
#ifdef _OPENMP
#pragma omp parallel for if (!openmp_per_triplets)
#endif
#endif
for (i = 0; i < num_g_pos; i++) {
if (freqs0[band_indices[g_pos[i][0]]] > cutoff_frequency &&
freqs1[g_pos[i][1]] > cutoff_frequency &&
freqs2[g_pos[i][2]] > cutoff_frequency) {
fc3_normal_squared[g_pos[i][3]] =
get_fc3_sum(e0 + band_indices[g_pos[i][0]] * num_band,
e1 + g_pos[i][1] * num_band,
e2 + g_pos[i][2] * num_band, fc3_reciprocal,
num_band) /
(freqs0[band_indices[g_pos[i][0]]] * freqs1[g_pos[i][1]] *
freqs2[g_pos[i][2]]);
if (use_multithreaded_blas) {
fc3_normal_squared[g_pos[i][3]] =
get_fc3_sum_blas(e0 + band_indices[g_pos[i][0]] * num_band,
e1 + g_pos[i][1] * num_band,
e2 + g_pos[i][2] * num_band,
fc3_reciprocal, num_band) /
(freqs0[band_indices[g_pos[i][0]]] * freqs1[g_pos[i][1]] *
freqs2[g_pos[i][2]]);
} else {
fc3_normal_squared[g_pos[i][3]] =
get_fc3_sum_blas_like(
e0 + band_indices[g_pos[i][0]] * num_band,
e1 + g_pos[i][1] * num_band,
e2 + g_pos[i][2] * num_band, fc3_reciprocal, num_band) /
(freqs0[band_indices[g_pos[i][0]]] * freqs1[g_pos[i][1]] *
freqs2[g_pos[i][2]]);
}
} else {
fc3_normal_squared[g_pos[i][3]] = 0;
}
Expand Down Expand Up @@ -234,3 +258,46 @@ static double get_fc3_sum_blas(const lapack_complex_double *e0,
lapack_complex_double_imag(retval) *
lapack_complex_double_imag(retval);
}

static double get_fc3_sum_blas_like(const lapack_complex_double *e0,
const lapack_complex_double *e1,
const lapack_complex_double *e2,
const lapack_complex_double *fc3_reciprocal,
const long num_band) {
long i, j;
double sum_real, sum_imag, retval_real, retval_imag;
lapack_complex_double *e_12, fc3_e_12, fc3_e_012;

e_12 = (lapack_complex_double *)malloc(sizeof(lapack_complex_double) *
num_band * num_band);

for (i = 0; i < num_band; i++) {
memcpy(e_12 + i * num_band, e2, 16 * num_band);
for (j = 0; j < num_band; j++) {
e_12[i * num_band + j] =
phonoc_complex_prod(e1[i], e_12[i * num_band + j]);
}
}

retval_real = 0;
retval_imag = 0;
for (i = 0; i < num_band; i++) {
sum_real = 0;
sum_imag = 0;
for (j = 0; j < num_band * num_band; j++) {
fc3_e_12 = phonoc_complex_prod(
fc3_reciprocal[i * num_band * num_band + j], e_12[j]);
sum_real += lapack_complex_double_real(fc3_e_12);
sum_imag += lapack_complex_double_imag(fc3_e_12);
}
fc3_e_012 = phonoc_complex_prod(
e0[i], lapack_make_complex_double(sum_real, sum_imag));
retval_real += lapack_complex_double_real(fc3_e_012);
retval_imag += lapack_complex_double_imag(fc3_e_012);
}

free(e_12);
e_12 = NULL;

return retval_real * retval_real + retval_imag * retval_imag;
}

0 comments on commit 427e004

Please sign in to comment.