parallelize CG solver, lower acc for speed
Some checks failed
studiorailgun/Renderer/pipeline/head There was a failure building this commit

This commit is contained in:
austin 2024-12-11 23:47:52 -05:00
parent 7c13bf6d99
commit a8c879e384
6 changed files with 226 additions and 86 deletions

View File

@ -10,12 +10,12 @@
/**
* The number of times to relax most solvers
*/
#define FLUID_GRID2_SOLVER_MULTIGRID_MAX_ITERATIONS 20
#define FLUID_GRID2_SOLVER_MULTIGRID_MAX_ITERATIONS 5
/**
* Tolerance to target for multigrid precomputing in projection
*/
#define FLUID_GRID2_SOLVER_MULTIGRID_TOLERANCE 0.005f
#define FLUID_GRID2_SOLVER_MULTIGRID_TOLERANCE 1.0f
/**
* The number of times to relax most solvers
@ -25,7 +25,7 @@
/**
* Tolerance to target for conjugate gradient precomputing in projection
*/
#define FLUID_GRID2_SOLVER_CG_TOLERANCE 0.001f
#define FLUID_GRID2_SOLVER_CG_TOLERANCE 0.1f
/**
* Width of a single grid cell
@ -50,7 +50,7 @@
/**
* The tolerance for convergence of the projection operator
*/
#define FLUID_GRID2_PROJECTION_CONVERGENCE_TOLERANCE 0.005f
#define FLUID_GRID2_PROJECTION_CONVERGENCE_TOLERANCE FLUID_GRID2_SOLVER_CG_TOLERANCE
/**
* Diffusion constant

View File

@ -12,15 +12,6 @@
*/
int solver_conjugate_gradient_init(float * phi, float * phi0, float a, float c);
/**
* Iteratively solves an ODE matrix by 1 iteration of conjugate gradient method
* @param phi The phi array
* @param phi0 The phi array from the last frame
* @param a The a const
* @param c The c const
*/
void solver_conjugate_gradient_iterate_parallel(float * phi, float * phi0, float a, float c);
/**
* Initializes the conjugate gradient solver
* @param phi The phi array
@ -36,7 +27,18 @@ void solver_conjugate_gradient_init_serial(float * phi, float * phi0, float a, f
* @param phi0 The phi array from the last frame
* @param a The a const
* @param c The c const
* @return The residual
*/
float solver_conjugate_gradient_iterate_serial(float * phi, float * phi0, float a, float c);
/**
* Iteratively solves an ODE matrix by 1 iteration of conjugate gradient method
* @param phi The phi array
* @param phi0 The phi array from the last frame
* @param a The a const
* @param c The c const
* @return The residual
*/
float solver_conjugate_gradient_iterate_parallel(float * phi, float * phi0, float a, float c);
#endif

View File

@ -196,16 +196,17 @@ LIBRARY_API void fluid_grid2_solveProjection(
// }
//init CG solver
// solver_conjugate_gradient_init_serial(p,div,a,c);
// //solve with CG
// while(chunk->projectionIterations < FLUID_GRID2_SOLVER_CG_MAX_ITERATIONS && (chunk->projectionResidual > FLUID_GRID2_SOLVER_CG_TOLERANCE || chunk->projectionResidual < -FLUID_GRID2_SOLVER_CG_TOLERANCE)){
// chunk->projectionResidual = solver_conjugate_gradient_iterate_serial(p,div,a,c);;
// fluid_grid2_set_bounds(FLUID_GRID2_BOUND_NO_DIR,p);
// chunk->projectionIterations++;
// }
// if(chunk->projectionResidual > FLUID_GRID2_SOLVER_CG_TOLERANCE || chunk->projectionResidual < -FLUID_GRID2_SOLVER_CG_TOLERANCE){
// // printf("Projection residual didn't converge! %f \n",residual);
// }
solver_conjugate_gradient_init_serial(p,div,a,c);
chunk->projectionIterations = 0;
//solve with CG
while(chunk->projectionIterations < FLUID_GRID2_SOLVER_CG_MAX_ITERATIONS && (chunk->projectionResidual > FLUID_GRID2_SOLVER_CG_TOLERANCE || chunk->projectionResidual < -FLUID_GRID2_SOLVER_CG_TOLERANCE)){
chunk->projectionResidual = solver_conjugate_gradient_iterate_parallel(p,div,a,c);;
fluid_grid2_set_bounds(FLUID_GRID2_BOUND_NO_DIR,p);
chunk->projectionIterations++;
}
if(chunk->projectionResidual > FLUID_GRID2_SOLVER_CG_TOLERANCE || chunk->projectionResidual < -FLUID_GRID2_SOLVER_CG_TOLERANCE){
// printf("Projection residual didn't converge! %f \n",chunk->projectionResidual);
}
//finest grain iteration with conjugate gradient method
// int shouldSolve = solver_conjugate_gradient_init(p,div,a,c);

View File

@ -5,6 +5,7 @@
#include "fluid/queue/chunk.h"
#include "fluid/sim/grid2/solver_consts.h"
#include "math/ode/ode_utils.h"
#include "math/ode/gauss_seidel.h"
/**
@ -115,73 +116,209 @@ int solver_conjugate_gradient_init(float * phi, float * phi0, float a, float c){
* @param a The a const
* @param c The c const
*/
void solver_conjugate_gradient_iterate_parallel(float * phi, float * phi0, float a, float c){
float solver_conjugate_gradient_iterate_parallel(float * phi, float * phi0, float a, float c){
int i, j, k;
__m256 aScalar = _mm256_set1_ps(a);
__m256 cScalar = _mm256_set1_ps(c);
__m256 epsilonScalar = _mm256_set1_ps(CONJUGATE_GRADIENT_EPSILON);
__m256 stencil;
__m256 denominator, r_old_squared, r_new_squared, a_k, r_delta, beta, p_old, p_new, phi_old, phi_new, r_old, r_new;
__m256 nanMask, nanComp;
__m256i iMask0, iMask1;
//iniitalize the r (residual) and p (search direction) arrays
__m256 constVec = _mm256_set1_ps(6);
__m256 convergenceVec, denominatorVec;
__m256 alphaVec;
__m256 rdotVec;
__m256 rVec;
__m256 betaVec;
float vec_sum_storage[8];
float convergence, denominator;
float laplacian, alpha, r_new_dot, beta;
//solve Ap
for(k=1; k<DIM-1; k++){
for(j=1; j<DIM-1; j++){
//solve as much as possible vectorized
// for(i = 1; i < DIM-1; i++){
// laplacian =
// (
// 6 * p[ode_index (i , j , k ,DIM)] +
// - (
// p[ode_index (i+1 , j , k ,DIM)] +
// p[ode_index (i-1 , j , k ,DIM)] +
// p[ode_index (i , j+1 , k ,DIM)] +
// p[ode_index (i , j-1 , k ,DIM)] +
// p[ode_index (i , j , k+1 ,DIM)] +
// p[ode_index (i , j , k-1 ,DIM)]
// )
// );
// A[ode_index(i,j,k,DIM)] = laplacian;
// }
for(i = 1; i < DIM-1; i=i+8){
//calculate the stencil applied to p
//get values from neighbors
stencil = _mm256_loadu_ps(&p[ode_index( i-1, j, k, DIM )]);
stencil = _mm256_add_ps(stencil,_mm256_loadu_ps(&p[ode_index( i+1, j, k, DIM )]));
stencil = _mm256_add_ps(stencil,_mm256_loadu_ps(&p[ode_index( i, j-1, k, DIM )]));
stencil = _mm256_add_ps(stencil,_mm256_loadu_ps(&p[ode_index( i, j+1, k, DIM )]));
stencil = _mm256_add_ps(stencil,_mm256_loadu_ps(&p[ode_index( i, j, k-1, DIM )]));
stencil = _mm256_add_ps(stencil,_mm256_loadu_ps(&p[ode_index( i, j, k+1, DIM )]));
//multiply by a
stencil = _mm256_mul_ps(stencil,aScalar);
//add previous value
stencil = _mm256_add_ps(stencil,_mm256_loadu_ps(&p[ode_index(i,j,k,DIM)]));
//divide by 6
stencil = _mm256_div_ps(stencil,cScalar);
//load p0
p_old = _mm256_loadu_ps(&p[ode_index(i,j,k,DIM)]);
//calculate ak
denominator = _mm256_mul_ps(stencil,p_old);
denominator = _mm256_add_ps(denominator,epsilonScalar);
r_old_squared = _mm256_mul_ps(_mm256_loadu_ps(&r[ode_index(i,j,k,DIM)]),_mm256_loadu_ps(&r[ode_index(i,j,k,DIM)]));
a_k = _mm256_div_ps(r_old_squared,denominator);
//update phi
phi_old = _mm256_loadu_ps(&phi[ode_index(i,j,k,DIM)]);
phi_new = _mm256_mul_ps(p_old,a_k);
phi_new = _mm256_add_ps(phi_new,phi_old);
//store new phi
_mm256_storeu_ps(&phi[ode_index(i,j,k,DIM)],phi_new);
//update residual
r_delta = _mm256_add_ps(stencil,a_k);
r_old = _mm256_loadu_ps(&r[ode_index(i,j,k,DIM)]);
r_new = _mm256_sub_ps(r_old,r_delta);
_mm256_storeu_ps(&r[ode_index(i,j,k,DIM)],r_new);
//calculate beta
r_new_squared = _mm256_mul_ps(r_new,r_new);
denominator = _mm256_add_ps(r_old_squared,epsilonScalar);
beta = _mm256_div_ps(r_new_squared,denominator);
//update p (search dir)
p_new = _mm256_mul_ps(beta,p_old);
p_new = _mm256_add_ps(r_new,p_new);
_mm256_storeu_ps(&p[ode_index(i,j,k,DIM)],p_new);
_mm256_storeu_ps(
&A[ode_index(i,j,k,DIM)],
_mm256_sub_ps(
_mm256_mul_ps(
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i,j,k,DIM)]),
constVec
),
_mm256_add_ps(
_mm256_add_ps(
_mm256_add_ps(
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i-1,j,k,DIM)]),
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i+1,j,k,DIM)])
),
_mm256_add_ps(
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i,j-1,k,DIM)]),
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i,j+1,k,DIM)])
)
),
_mm256_add_ps(
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i,j,k-1,DIM)]),
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i,j,k+1,DIM)])
)
)
)
);
}
}
}
convergenceVec = _mm256_setzero_ps();
denominatorVec = _mm256_set1_ps(CONJUGATE_GRADIENT_EPSILON);
convergence = 0;
denominator = CONJUGATE_GRADIENT_EPSILON;
for(k=1; k<DIM-1; k++){
for(j=1; j<DIM-1; j++){
// for(i = 1; i < DIM-1; i++){
// convergence = convergence + r[ode_index(i,j,k,DIM)] * r[ode_index(i,j,k,DIM)];
// // denominator = denominator + p[ode_index(i,j,k,DIM)] * A[ode_index(i,j,k,DIM)];
// }
for(i = 1; i < DIM-1; i=i+8){
//convergence = convergence + r[ode_index(i,j,k,DIM)] * r[ode_index(i,j,k,DIM)];
rVec = _mm256_loadu_ps(&r[solver_gauss_seidel_get_index(i,j,k,DIM)]);
rdotVec = _mm256_mul_ps(
rVec,
rVec
);
convergenceVec = _mm256_add_ps(
convergenceVec,
rdotVec
);
//denominator = denominator + p[ode_index(i,j,k,DIM)] * A[ode_index(i,j,k,DIM)];
denominatorVec = _mm256_add_ps(
denominatorVec,
_mm256_mul_ps(
_mm256_loadu_ps(&A[solver_gauss_seidel_get_index(i,j,k,DIM)]),
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i,j,k,DIM)])
)
);
}
}
}
//collect convergence
_mm256_storeu_ps(vec_sum_storage,convergenceVec);
convergence =
vec_sum_storage[0] + vec_sum_storage[1] +
vec_sum_storage[2] + vec_sum_storage[3] +
vec_sum_storage[4] + vec_sum_storage[5] +
vec_sum_storage[6] + vec_sum_storage[7]
;
//collect denominator
_mm256_storeu_ps(vec_sum_storage,denominatorVec);
denominator =
vec_sum_storage[0] + vec_sum_storage[1] +
vec_sum_storage[2] + vec_sum_storage[3] +
vec_sum_storage[4] + vec_sum_storage[5] +
vec_sum_storage[6] + vec_sum_storage[7]
;
//have hit the desired level of convergence
if(convergence < CONJUGATE_GRADIENT_EPSILON && convergence > -CONJUGATE_GRADIENT_EPSILON){
return 0.0f;
}
//error check
if(denominator < CONJUGATE_GRADIENT_EPSILON && denominator > -CONJUGATE_GRADIENT_EPSILON){
printf("Divide by 0! %f \n", denominator);
fflush(stdout);
}
alpha = convergence / denominator;
r_new_dot = 0;
alphaVec = _mm256_set1_ps(alpha);
for(k=1; k<DIM-1; k++){
for(j=1; j<DIM-1; j++){
// for(i = 1; i < DIM-1; i++){
// phi[ode_index(i,j,k,DIM)] = phi[ode_index(i,j,k,DIM)] + alpha * p[ode_index(i,j,k,DIM)];
// r[ode_index(i,j,k,DIM)] = r[ode_index(i,j,k,DIM)] - alpha * A[ode_index(i,j,k,DIM)];
// r_new_dot = r_new_dot + r[ode_index(i,j,k,DIM)] * r[ode_index(i,j,k,DIM)];
// }
for(i = 1; i < DIM-1; i=i+8){
//phi[ode_index(i,j,k,DIM)] = phi[ode_index(i,j,k,DIM)] + alpha * p[ode_index(i,j,k,DIM)];
_mm256_storeu_ps(
&phi[ode_index(i,j,k,DIM)],
_mm256_add_ps(
_mm256_loadu_ps(&phi[solver_gauss_seidel_get_index(i,j,k,DIM)]),
_mm256_mul_ps(
alphaVec,
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i,j,k,DIM)])
)
)
);
// r[ode_index(i,j,k,DIM)] = r[ode_index(i,j,k,DIM)] - alpha * A[ode_index(i,j,k,DIM)];
_mm256_storeu_ps(
&r[ode_index(i,j,k,DIM)],
_mm256_sub_ps(
_mm256_loadu_ps(&r[solver_gauss_seidel_get_index(i,j,k,DIM)]),
_mm256_mul_ps(
alphaVec,
_mm256_loadu_ps(&A[solver_gauss_seidel_get_index(i,j,k,DIM)])
)
)
);
// r_new_dot = r_new_dot + r[ode_index(i,j,k,DIM)] * r[ode_index(i,j,k,DIM)];
rdotVec = _mm256_add_ps(
rdotVec,
_mm256_mul_ps(
_mm256_loadu_ps(&r[solver_gauss_seidel_get_index(i,j,k,DIM)]),
_mm256_loadu_ps(&r[solver_gauss_seidel_get_index(i,j,k,DIM)])
)
);
}
}
}
//collect rdot
_mm256_storeu_ps(vec_sum_storage,rdotVec);
r_new_dot =
vec_sum_storage[0] + vec_sum_storage[1] +
vec_sum_storage[2] + vec_sum_storage[3] +
vec_sum_storage[4] + vec_sum_storage[5] +
vec_sum_storage[6] + vec_sum_storage[7]
;
//calculate beta
beta = r_new_dot / convergence;
betaVec = _mm256_set1_ps(beta);
//update p
for(k=1; k<DIM-1; k++){
for(j=1; j<DIM-1; j++){
// for(i = 1; i < DIM-1; i++){
// p[ode_index(i,j,k,DIM)] = r[ode_index(i,j,k,DIM)] + beta * p[ode_index(i,j,k,DIM)];
// }
for(i = 1; i < DIM-1; i=i+8){
//p[ode_index(i,j,k,DIM)] = r[ode_index(i,j,k,DIM)] + beta * p[ode_index(i,j,k,DIM)];
_mm256_storeu_ps(
&p[ode_index(i,j,k,DIM)],
_mm256_add_ps(
_mm256_loadu_ps(&r[solver_gauss_seidel_get_index(i,j,k,DIM)]),
_mm256_mul_ps(
betaVec,
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i,j,k,DIM)])
)
)
);
}
}
}
return (float)sqrt(convergence);
}
/**

View File

@ -22,7 +22,7 @@
/**
* Error margin for tests
*/
#define FLUID_GRID2_PROJECTION_ERROR_MARGIN 0.00001f
#define FLUID_GRID2_PROJECTION_ERROR_MARGIN 0.005f
/**
* Maximum convergence we want to see in any chunk

View File

@ -229,9 +229,9 @@ int fluid_sim_grid2_speed_tests(){
fluid_grid2_allocate_arrays();
// rVal += fluid_sim_grid2_speed_test1();
// rVal += fluid_sim_grid2_speed_test2();
// rVal += fluid_sim_grid2_speed_test3();
rVal += fluid_sim_grid2_speed_test1();
rVal += fluid_sim_grid2_speed_test2();
rVal += fluid_sim_grid2_speed_test3();
return rVal;
}