From a7efb2120f1da0733332cc94dea508bcc63be7c4 Mon Sep 17 00:00:00 2001 From: Hanno Rein Date: Thu, 22 Aug 2024 14:59:31 -0400 Subject: [PATCH] AVX512 fallback synchronization fixed and unit test added. --- examples/whfast512_unittests/problem.c | 39 ++++ src/integrator_whfast512.c | 238 +++++++++++++------------ src/integrator_whfast512.h | 1 + 3 files changed, 168 insertions(+), 110 deletions(-) diff --git a/examples/whfast512_unittests/problem.c b/examples/whfast512_unittests/problem.c index d3af737ae..de00934b7 100644 --- a/examples/whfast512_unittests/problem.c +++ b/examples/whfast512_unittests/problem.c @@ -272,6 +272,7 @@ int test_restart(){ if (reb_simulation_integrate(r512, tmax)>0) return 0; if (reb_simulation_integrate(r512, 2.*tmax)>0) return 0; + remove("test.bin"); reb_simulation_save_to_file(r512, "test.bin"); if (reb_simulation_integrate(r512, 3.*tmax)>0) return 0; if (reb_simulation_integrate(r512, tmaxfinal)>0) return 0; @@ -298,6 +299,43 @@ int test_restart(){ return 1; } +// Only needed for unit testing +void reb_integrator_whfast512_synchronize_fallback(struct reb_simulation* const r); + +int test_synchronization_fallback(){ + remove("test.bin"); + struct reb_simulation* r512 = setup_sim(9); + + r512->integrator = REB_INTEGRATOR_WHFAST512; + r512->ri_whfast512.gr_potential = 1; + reb_simulation_save_to_file_interval(r512, "test.bin", 1.0); + if (reb_simulation_integrate(r512, 2.5)>0) return 0; + reb_simulation_free(r512); + + struct reb_simulation* r1 = reb_simulation_create_from_file("test.bin", 1); + struct reb_simulation* r2 = reb_simulation_create_from_file("test.bin", 1); + assert(r1->ri_whfast512.is_synchronized == 0); + assert(r2->ri_whfast512.is_synchronized == 0); + reb_simulation_synchronize(r1); + reb_integrator_whfast512_synchronize_fallback(r2); + assert(r1->ri_whfast512.is_synchronized == 1); + assert(r2->ri_whfast512.is_synchronized == 1); + + for (int i=0;iN;i++){ + double dx = fabs(r1->particles[i].x - r2->particles[i].x); + double dvx = fabs(r1->particles[i].vx - r2->particles[i].vx); + if (dx>1e-15 || dvx>1e-15){ + printf("Accuracy not met in synchronization fallback test.\n"); + printf("i=%i diff_x=%.16e diff_vx=%.16e\n",i,dx, dvx); + return 0; + } + } + + reb_simulation_free(r1); + reb_simulation_free(r2); + return 1; +} + int main(int argc, char* argv[]) { assert(test_basic()); assert(test_number_of_planets()); @@ -311,5 +349,6 @@ int main(int argc, char* argv[]) { assert(test_com()); assert(test_twobody()); assert(test_gr()); + assert(test_synchronization_fallback()); printf("All tests passed.\n"); } diff --git a/src/integrator_whfast512.c b/src/integrator_whfast512.c index 4f6aa83ea..f559792f0 100644 --- a/src/integrator_whfast512.c +++ b/src/integrator_whfast512.c @@ -38,8 +38,6 @@ #include "integrator_whfast.h" #include "integrator_whfast512.h" -#ifdef AVX512 - #ifdef PROF // Profiling counters double walltime_interaction=0;; @@ -48,6 +46,99 @@ double walltime_jump=0; double walltime_com=0; #endif +// Performs one full center of mass step (H_0) +static void reb_whfast512_com_step(struct reb_simulation* r, const double _dt){ +#ifdef PROF + struct reb_timeval time_beginning; + gettimeofday(&time_beginning,NULL); +#endif + const unsigned int N_systems = r->ri_whfast512.N_systems; + for (int s=0; sri_whfast512.p_jh0[s].x += _dt*r->ri_whfast512.p_jh0[s].vx; + r->ri_whfast512.p_jh0[s].y += _dt*r->ri_whfast512.p_jh0[s].vy; + r->ri_whfast512.p_jh0[s].z += _dt*r->ri_whfast512.p_jh0[s].vz; + } +#ifdef PROF + struct reb_timeval time_end; + gettimeofday(&time_end,NULL); + walltime_com += time_end.tv_sec-time_beginning.tv_sec+(time_end.tv_usec-time_beginning.tv_usec)/1e6; +#endif +} + +// Convert democratic heliocentric coordinates to inertial coordinates +// Note: this is only called at the end. Speed is not a concern. +static void democraticheliocentric_to_inertial_posvel(struct reb_simulation* r){ + struct reb_integrator_whfast512* const ri_whfast512 = &(r->ri_whfast512); + struct reb_particle* particles = r->particles; + struct reb_particle_avx512* p_jh = ri_whfast512->p_jh; + const unsigned int N_systems = ri_whfast512->N_systems; + const unsigned int p_per_system = 8/N_systems; + const unsigned int N_per_system = r->N/N_systems; + + +#ifdef AVX512 + double m[8]; + double x[8]; + double y[8]; + double z[8]; + double vx[8]; + double vy[8]; + double vz[8]; + _mm512_storeu_pd(&m, p_jh->m); + _mm512_storeu_pd(&x, p_jh->x); + _mm512_storeu_pd(&y, p_jh->y); + _mm512_storeu_pd(&z, p_jh->z); + _mm512_storeu_pd(&vx, p_jh->vx); + _mm512_storeu_pd(&vy, p_jh->vy); + _mm512_storeu_pd(&vz, p_jh->vz); +#else // AVX512 + // Fallback for synchronization for when AVX512 is not available + double* m = (double*)p_jh->m; + double* x = (double*)p_jh->x; + double* y = (double*)p_jh->y; + double* z = (double*)p_jh->z; + double* vx = p_jh->vx; + double* vy = p_jh->vy; + double* vz = p_jh->vz; +#endif // AVX512 + + for (unsigned s=0;sp_jh0[s].vx; + particles[s*N_per_system+i].vy = vy[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vy; + particles[s*N_per_system+i].vz = vz[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vz; + } + x0s /= ri_whfast512->p_jh0[s].m; + y0s /= ri_whfast512->p_jh0[s].m; + z0s /= ri_whfast512->p_jh0[s].m; + particles[s*N_per_system].x = ri_whfast512->p_jh0[s].x - x0s; + particles[s*N_per_system].y = ri_whfast512->p_jh0[s].y - y0s; + particles[s*N_per_system].z = ri_whfast512->p_jh0[s].z - z0s; + particles[s*N_per_system].vx = ri_whfast512->p_jh0[s].vx - vx0s; + particles[s*N_per_system].vy = ri_whfast512->p_jh0[s].vy - vy0s; + particles[s*N_per_system].vz = ri_whfast512->p_jh0[s].vz - vz0s; + for (unsigned int i=1; iri_whfast512); - struct reb_particle* particles = r->particles; - struct reb_particle_avx512* p_jh = ri_whfast512->p_jh; - const unsigned int N_systems = ri_whfast512->N_systems; - const unsigned int p_per_system = 8/N_systems; - const unsigned int N_per_system = r->N/N_systems; - - double m[8]; - double x[8]; - double y[8]; - double z[8]; - double vx[8]; - double vy[8]; - double vz[8]; - _mm512_storeu_pd(&m, p_jh->m); - _mm512_storeu_pd(&x, p_jh->x); - _mm512_storeu_pd(&y, p_jh->y); - _mm512_storeu_pd(&z, p_jh->z); - _mm512_storeu_pd(&vx, p_jh->vx); - _mm512_storeu_pd(&vy, p_jh->vy); - _mm512_storeu_pd(&vz, p_jh->vz); - - for (unsigned s=0;sp_jh0[s].vx; - particles[s*N_per_system+i].vy = vy[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vy; - particles[s*N_per_system+i].vz = vz[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vz; - } - x0s /= ri_whfast512->p_jh0[s].m; - y0s /= ri_whfast512->p_jh0[s].m; - z0s /= ri_whfast512->p_jh0[s].m; - particles[s*N_per_system].x = ri_whfast512->p_jh0[s].x - x0s; - particles[s*N_per_system].y = ri_whfast512->p_jh0[s].y - y0s; - particles[s*N_per_system].z = ri_whfast512->p_jh0[s].z - z0s; - particles[s*N_per_system].vx = ri_whfast512->p_jh0[s].vx - vx0s; - particles[s*N_per_system].vy = ri_whfast512->p_jh0[s].vy - vy0s; - particles[s*N_per_system].vz = ri_whfast512->p_jh0[s].vz - vz0s; - for (unsigned int i=1; iri_whfast512.N_systems; - for (int s=0; sri_whfast512.p_jh0[s].x += _dt*r->ri_whfast512.p_jh0[s].vx; - r->ri_whfast512.p_jh0[s].y += _dt*r->ri_whfast512.p_jh0[s].vy; - r->ri_whfast512.p_jh0[s].z += _dt*r->ri_whfast512.p_jh0[s].vz; - } -#ifdef PROF - struct reb_timeval time_end; - gettimeofday(&time_end,NULL); - walltime_com += time_end.tv_sec-time_beginning.tv_sec+(time_end.tv_usec-time_beginning.tv_usec)/1e6; -#endif -} - // Precalculate various constants and put them in 512 bit vectors. void static recalculate_constants(struct reb_simulation* r){ struct reb_integrator_whfast512* const ri_whfast512 = &(r->ri_whfast512); @@ -1025,10 +1038,10 @@ void reb_integrator_whfast512_part1(struct reb_simulation* const r){ // Synchronization routine. Called every time an output is needed. void reb_integrator_whfast512_synchronize(struct reb_simulation* const r){ +#ifdef AVX512 struct reb_integrator_whfast512* const ri_whfast512 = &(r->ri_whfast512); if (ri_whfast512->is_synchronized == 0){ const unsigned int N_systems = ri_whfast512->N_systems; -#ifdef AVX512 struct reb_particle_avx512* sync_pj = NULL; struct reb_particle sync_pj0[4]; if (ri_whfast512->recalculate_constants){ @@ -1054,42 +1067,47 @@ void reb_integrator_whfast512_synchronize(struct reb_simulation* const r){ }else{ ri_whfast512->is_synchronized = 1; } -#else // No AVX512 available - // Using WHFast as a workaround. - // Not bit-wise reproducible. + } +#else + reb_integrator_whfast512_synchronize_fallback(r); +#endif // AVX512 +} + +void reb_integrator_whfast512_synchronize_fallback(struct reb_simulation* const r){ + // No AVX512 available + // Using WHFast as a workaround. + // Not bit-wise reproducible. + struct reb_integrator_whfast512* const ri_whfast512 = &(r->ri_whfast512); + if (ri_whfast512->is_synchronized == 0){ reb_simulation_warning(r, "WHFast512 is not available. Synchronization is provided using WHFast and is not bit-compatible to WHFast512."); + const unsigned int N_systems = ri_whfast512->N_systems; const unsigned int p_per_system = 8/N_systems; const unsigned int N_per_system = r->N/N_systems; + double dt = r->dt; for (int s=0; sG = r->G; - for (unsigned int i=0;i<=N_per_system;i++){ - reb_simulation_add(rs, r->particles[s*N_per_system+i]); + double m0 = r->particles[s*N_per_system].m; + // 1/2 Kepler + for (unsigned int i=1;ip_jh->m[s*p_per_system+i-1]; + p.x = ri_whfast512->p_jh->x[s*p_per_system+i-1]; + p.y = ri_whfast512->p_jh->y[s*p_per_system+i-1]; + p.z = ri_whfast512->p_jh->z[s*p_per_system+i-1]; + p.vx = ri_whfast512->p_jh->vx[s*p_per_system+i-1]; + p.vy = ri_whfast512->p_jh->vy[s*p_per_system+i-1]; + p.vz = ri_whfast512->p_jh->vz[s*p_per_system+i-1]; + reb_whfast_kepler_solver(r, &p, m0, 0, dt/2.0); + ri_whfast512->p_jh->x[s*p_per_system+i-1] = p.x; + ri_whfast512->p_jh->y[s*p_per_system+i-1] = p.y; + ri_whfast512->p_jh->z[s*p_per_system+i-1] = p.z; + ri_whfast512->p_jh->vx[s*p_per_system+i-1] = p.vx; + ri_whfast512->p_jh->vy[s*p_per_system+i-1] = p.vy; + ri_whfast512->p_jh->vz[s*p_per_system+i-1] = p.vz; } - struct reb_integrator_whfast* const ri_whfast = &(rs->ri_whfast); - reb_integrator_whfast_init(rs); - ri_whfast->coordinates = REB_WHFAST_COORDINATES_DEMOCRATICHELIOCENTRIC; - ri_whfast->is_synchronized = 0; - ri_whfast->p_jh[0] = ri_whfast512->p_jh0[s]; - for (unsigned int i=1;i<=N_per_system;i++){ - ri_whfast->p_jh[i].m = ri_whfast512->p_jh->m[s*p_per_system+i-1]; - ri_whfast->p_jh[i].x = ri_whfast512->p_jh->x[s*p_per_system+i-1]; - ri_whfast->p_jh[i].y = ri_whfast512->p_jh->y[s*p_per_system+i-1]; - ri_whfast->p_jh[i].z = ri_whfast512->p_jh->z[s*p_per_system+i-1]; - ri_whfast->p_jh[i].vx = ri_whfast512->p_jh->vx[s*p_per_system+i-1]; - ri_whfast->p_jh[i].vy = ri_whfast512->p_jh->vy[s*p_per_system+i-1]; - ri_whfast->p_jh[i].vz = ri_whfast512->p_jh->vz[s*p_per_system+i-1]; - } - reb_integrator_whfast_synchronize(rs); - for (unsigned int i=0;iparticles[s*N_per_system+i] = rs->particles[i]; - r->particles[s*N_per_system+i].sim = r; - } - reb_simulation_free(rs); } - + reb_whfast512_com_step(r, dt/2.0); // does not use AVX512 + democraticheliocentric_to_inertial_posvel(r); ri_whfast512->is_synchronized = 1; -#endif // AVX512 } } diff --git a/src/integrator_whfast512.h b/src/integrator_whfast512.h index 716a0eeff..17a47e910 100644 --- a/src/integrator_whfast512.h +++ b/src/integrator_whfast512.h @@ -32,6 +32,7 @@ void reb_integrator_whfast512_reset(struct reb_simulation* r); void reb_integrator_whfast512_part1(struct reb_simulation* r); ///< Internal function used to call a specific integrator void reb_integrator_whfast512_part2(struct reb_simulation* r); ///< Internal function used to call a specific integrator void reb_integrator_whfast512_synchronize(struct reb_simulation* r); ///< Internal function used to call a specific integrator +void reb_integrator_whfast512_synchronize_fallback(struct reb_simulation* const r); // Internal function. void reb_whfast512_kepler_solver(const struct reb_simulation* const r, struct reb_particle* const restrict p_j, const double M, unsigned int i, double _dt); ///< Internal function (Main WHFast Kepler Solver) void reb_whfast512_calculate_jerk(struct reb_simulation* r); ///< Calculates "jerk" term