parallelize CG solver, lower acc for speed
Some checks failed
studiorailgun/Renderer/pipeline/head There was a failure building this commit
Some checks failed
studiorailgun/Renderer/pipeline/head There was a failure building this commit
This commit is contained in:
parent
7c13bf6d99
commit
a8c879e384
@ -10,12 +10,12 @@
|
||||
/**
|
||||
* The number of times to relax most solvers
|
||||
*/
|
||||
#define FLUID_GRID2_SOLVER_MULTIGRID_MAX_ITERATIONS 20
|
||||
#define FLUID_GRID2_SOLVER_MULTIGRID_MAX_ITERATIONS 5
|
||||
|
||||
/**
|
||||
* Tolerance to target for multigrid precomputing in projection
|
||||
*/
|
||||
#define FLUID_GRID2_SOLVER_MULTIGRID_TOLERANCE 0.005f
|
||||
#define FLUID_GRID2_SOLVER_MULTIGRID_TOLERANCE 1.0f
|
||||
|
||||
/**
|
||||
* The number of times to relax most solvers
|
||||
@ -25,7 +25,7 @@
|
||||
/**
|
||||
* Tolerance to target for conjugate gradient precomputing in projection
|
||||
*/
|
||||
#define FLUID_GRID2_SOLVER_CG_TOLERANCE 0.001f
|
||||
#define FLUID_GRID2_SOLVER_CG_TOLERANCE 0.1f
|
||||
|
||||
/**
|
||||
* Width of a single grid cell
|
||||
@ -50,7 +50,7 @@
|
||||
/**
|
||||
* The tolerance for convergence of the projection operator
|
||||
*/
|
||||
#define FLUID_GRID2_PROJECTION_CONVERGENCE_TOLERANCE 0.005f
|
||||
#define FLUID_GRID2_PROJECTION_CONVERGENCE_TOLERANCE FLUID_GRID2_SOLVER_CG_TOLERANCE
|
||||
|
||||
/**
|
||||
* Diffusion constant
|
||||
|
||||
@ -12,15 +12,6 @@
|
||||
*/
|
||||
int solver_conjugate_gradient_init(float * phi, float * phi0, float a, float c);
|
||||
|
||||
/**
|
||||
* Iteratively solves an ODE matrix by 1 iteration of conjugate gradient method
|
||||
* @param phi The phi array
|
||||
* @param phi0 The phi array from the last frame
|
||||
* @param a The a const
|
||||
* @param c The c const
|
||||
*/
|
||||
void solver_conjugate_gradient_iterate_parallel(float * phi, float * phi0, float a, float c);
|
||||
|
||||
/**
|
||||
* Initializes the conjugate gradient solver
|
||||
* @param phi The phi array
|
||||
@ -36,7 +27,18 @@ void solver_conjugate_gradient_init_serial(float * phi, float * phi0, float a, f
|
||||
* @param phi0 The phi array from the last frame
|
||||
* @param a The a const
|
||||
* @param c The c const
|
||||
* @return The residual
|
||||
*/
|
||||
float solver_conjugate_gradient_iterate_serial(float * phi, float * phi0, float a, float c);
|
||||
|
||||
/**
|
||||
* Iteratively solves an ODE matrix by 1 iteration of conjugate gradient method
|
||||
* @param phi The phi array
|
||||
* @param phi0 The phi array from the last frame
|
||||
* @param a The a const
|
||||
* @param c The c const
|
||||
* @return The residual
|
||||
*/
|
||||
float solver_conjugate_gradient_iterate_parallel(float * phi, float * phi0, float a, float c);
|
||||
|
||||
#endif
|
||||
@ -196,16 +196,17 @@ LIBRARY_API void fluid_grid2_solveProjection(
|
||||
// }
|
||||
|
||||
//init CG solver
|
||||
// solver_conjugate_gradient_init_serial(p,div,a,c);
|
||||
// //solve with CG
|
||||
// while(chunk->projectionIterations < FLUID_GRID2_SOLVER_CG_MAX_ITERATIONS && (chunk->projectionResidual > FLUID_GRID2_SOLVER_CG_TOLERANCE || chunk->projectionResidual < -FLUID_GRID2_SOLVER_CG_TOLERANCE)){
|
||||
// chunk->projectionResidual = solver_conjugate_gradient_iterate_serial(p,div,a,c);;
|
||||
// fluid_grid2_set_bounds(FLUID_GRID2_BOUND_NO_DIR,p);
|
||||
// chunk->projectionIterations++;
|
||||
// }
|
||||
// if(chunk->projectionResidual > FLUID_GRID2_SOLVER_CG_TOLERANCE || chunk->projectionResidual < -FLUID_GRID2_SOLVER_CG_TOLERANCE){
|
||||
// // printf("Projection residual didn't converge! %f \n",residual);
|
||||
// }
|
||||
solver_conjugate_gradient_init_serial(p,div,a,c);
|
||||
chunk->projectionIterations = 0;
|
||||
//solve with CG
|
||||
while(chunk->projectionIterations < FLUID_GRID2_SOLVER_CG_MAX_ITERATIONS && (chunk->projectionResidual > FLUID_GRID2_SOLVER_CG_TOLERANCE || chunk->projectionResidual < -FLUID_GRID2_SOLVER_CG_TOLERANCE)){
|
||||
chunk->projectionResidual = solver_conjugate_gradient_iterate_parallel(p,div,a,c);;
|
||||
fluid_grid2_set_bounds(FLUID_GRID2_BOUND_NO_DIR,p);
|
||||
chunk->projectionIterations++;
|
||||
}
|
||||
if(chunk->projectionResidual > FLUID_GRID2_SOLVER_CG_TOLERANCE || chunk->projectionResidual < -FLUID_GRID2_SOLVER_CG_TOLERANCE){
|
||||
// printf("Projection residual didn't converge! %f \n",chunk->projectionResidual);
|
||||
}
|
||||
|
||||
//finest grain iteration with conjugate gradient method
|
||||
// int shouldSolve = solver_conjugate_gradient_init(p,div,a,c);
|
||||
|
||||
@ -5,6 +5,7 @@
|
||||
#include "fluid/queue/chunk.h"
|
||||
#include "fluid/sim/grid2/solver_consts.h"
|
||||
#include "math/ode/ode_utils.h"
|
||||
#include "math/ode/gauss_seidel.h"
|
||||
|
||||
|
||||
/**
|
||||
@ -115,73 +116,209 @@ int solver_conjugate_gradient_init(float * phi, float * phi0, float a, float c){
|
||||
* @param a The a const
|
||||
* @param c The c const
|
||||
*/
|
||||
void solver_conjugate_gradient_iterate_parallel(float * phi, float * phi0, float a, float c){
|
||||
float solver_conjugate_gradient_iterate_parallel(float * phi, float * phi0, float a, float c){
|
||||
int i, j, k;
|
||||
|
||||
__m256 aScalar = _mm256_set1_ps(a);
|
||||
__m256 cScalar = _mm256_set1_ps(c);
|
||||
__m256 epsilonScalar = _mm256_set1_ps(CONJUGATE_GRADIENT_EPSILON);
|
||||
__m256 stencil;
|
||||
__m256 denominator, r_old_squared, r_new_squared, a_k, r_delta, beta, p_old, p_new, phi_old, phi_new, r_old, r_new;
|
||||
__m256 nanMask, nanComp;
|
||||
__m256i iMask0, iMask1;
|
||||
|
||||
//iniitalize the r (residual) and p (search direction) arrays
|
||||
__m256 constVec = _mm256_set1_ps(6);
|
||||
__m256 convergenceVec, denominatorVec;
|
||||
__m256 alphaVec;
|
||||
__m256 rdotVec;
|
||||
__m256 rVec;
|
||||
__m256 betaVec;
|
||||
float vec_sum_storage[8];
|
||||
float convergence, denominator;
|
||||
float laplacian, alpha, r_new_dot, beta;
|
||||
//solve Ap
|
||||
for(k=1; k<DIM-1; k++){
|
||||
for(j=1; j<DIM-1; j++){
|
||||
//solve as much as possible vectorized
|
||||
// for(i = 1; i < DIM-1; i++){
|
||||
// laplacian =
|
||||
// (
|
||||
// 6 * p[ode_index (i , j , k ,DIM)] +
|
||||
// - (
|
||||
// p[ode_index (i+1 , j , k ,DIM)] +
|
||||
// p[ode_index (i-1 , j , k ,DIM)] +
|
||||
// p[ode_index (i , j+1 , k ,DIM)] +
|
||||
// p[ode_index (i , j-1 , k ,DIM)] +
|
||||
// p[ode_index (i , j , k+1 ,DIM)] +
|
||||
// p[ode_index (i , j , k-1 ,DIM)]
|
||||
// )
|
||||
// );
|
||||
// A[ode_index(i,j,k,DIM)] = laplacian;
|
||||
// }
|
||||
for(i = 1; i < DIM-1; i=i+8){
|
||||
//calculate the stencil applied to p
|
||||
//get values from neighbors
|
||||
stencil = _mm256_loadu_ps(&p[ode_index( i-1, j, k, DIM )]);
|
||||
stencil = _mm256_add_ps(stencil,_mm256_loadu_ps(&p[ode_index( i+1, j, k, DIM )]));
|
||||
stencil = _mm256_add_ps(stencil,_mm256_loadu_ps(&p[ode_index( i, j-1, k, DIM )]));
|
||||
stencil = _mm256_add_ps(stencil,_mm256_loadu_ps(&p[ode_index( i, j+1, k, DIM )]));
|
||||
stencil = _mm256_add_ps(stencil,_mm256_loadu_ps(&p[ode_index( i, j, k-1, DIM )]));
|
||||
stencil = _mm256_add_ps(stencil,_mm256_loadu_ps(&p[ode_index( i, j, k+1, DIM )]));
|
||||
//multiply by a
|
||||
stencil = _mm256_mul_ps(stencil,aScalar);
|
||||
//add previous value
|
||||
stencil = _mm256_add_ps(stencil,_mm256_loadu_ps(&p[ode_index(i,j,k,DIM)]));
|
||||
//divide by 6
|
||||
stencil = _mm256_div_ps(stencil,cScalar);
|
||||
|
||||
//load p0
|
||||
p_old = _mm256_loadu_ps(&p[ode_index(i,j,k,DIM)]);
|
||||
|
||||
//calculate ak
|
||||
denominator = _mm256_mul_ps(stencil,p_old);
|
||||
denominator = _mm256_add_ps(denominator,epsilonScalar);
|
||||
r_old_squared = _mm256_mul_ps(_mm256_loadu_ps(&r[ode_index(i,j,k,DIM)]),_mm256_loadu_ps(&r[ode_index(i,j,k,DIM)]));
|
||||
a_k = _mm256_div_ps(r_old_squared,denominator);
|
||||
|
||||
|
||||
//update phi
|
||||
phi_old = _mm256_loadu_ps(&phi[ode_index(i,j,k,DIM)]);
|
||||
phi_new = _mm256_mul_ps(p_old,a_k);
|
||||
phi_new = _mm256_add_ps(phi_new,phi_old);
|
||||
|
||||
//store new phi
|
||||
_mm256_storeu_ps(&phi[ode_index(i,j,k,DIM)],phi_new);
|
||||
|
||||
//update residual
|
||||
r_delta = _mm256_add_ps(stencil,a_k);
|
||||
r_old = _mm256_loadu_ps(&r[ode_index(i,j,k,DIM)]);
|
||||
r_new = _mm256_sub_ps(r_old,r_delta);
|
||||
_mm256_storeu_ps(&r[ode_index(i,j,k,DIM)],r_new);
|
||||
|
||||
//calculate beta
|
||||
r_new_squared = _mm256_mul_ps(r_new,r_new);
|
||||
denominator = _mm256_add_ps(r_old_squared,epsilonScalar);
|
||||
beta = _mm256_div_ps(r_new_squared,denominator);
|
||||
|
||||
//update p (search dir)
|
||||
p_new = _mm256_mul_ps(beta,p_old);
|
||||
p_new = _mm256_add_ps(r_new,p_new);
|
||||
_mm256_storeu_ps(&p[ode_index(i,j,k,DIM)],p_new);
|
||||
_mm256_storeu_ps(
|
||||
&A[ode_index(i,j,k,DIM)],
|
||||
_mm256_sub_ps(
|
||||
_mm256_mul_ps(
|
||||
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i,j,k,DIM)]),
|
||||
constVec
|
||||
),
|
||||
_mm256_add_ps(
|
||||
_mm256_add_ps(
|
||||
_mm256_add_ps(
|
||||
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i-1,j,k,DIM)]),
|
||||
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i+1,j,k,DIM)])
|
||||
),
|
||||
_mm256_add_ps(
|
||||
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i,j-1,k,DIM)]),
|
||||
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i,j+1,k,DIM)])
|
||||
)
|
||||
),
|
||||
_mm256_add_ps(
|
||||
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i,j,k-1,DIM)]),
|
||||
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i,j,k+1,DIM)])
|
||||
)
|
||||
)
|
||||
)
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
convergenceVec = _mm256_setzero_ps();
|
||||
denominatorVec = _mm256_set1_ps(CONJUGATE_GRADIENT_EPSILON);
|
||||
convergence = 0;
|
||||
denominator = CONJUGATE_GRADIENT_EPSILON;
|
||||
for(k=1; k<DIM-1; k++){
|
||||
for(j=1; j<DIM-1; j++){
|
||||
// for(i = 1; i < DIM-1; i++){
|
||||
// convergence = convergence + r[ode_index(i,j,k,DIM)] * r[ode_index(i,j,k,DIM)];
|
||||
// // denominator = denominator + p[ode_index(i,j,k,DIM)] * A[ode_index(i,j,k,DIM)];
|
||||
// }
|
||||
for(i = 1; i < DIM-1; i=i+8){
|
||||
//convergence = convergence + r[ode_index(i,j,k,DIM)] * r[ode_index(i,j,k,DIM)];
|
||||
rVec = _mm256_loadu_ps(&r[solver_gauss_seidel_get_index(i,j,k,DIM)]);
|
||||
rdotVec = _mm256_mul_ps(
|
||||
rVec,
|
||||
rVec
|
||||
);
|
||||
convergenceVec = _mm256_add_ps(
|
||||
convergenceVec,
|
||||
rdotVec
|
||||
);
|
||||
//denominator = denominator + p[ode_index(i,j,k,DIM)] * A[ode_index(i,j,k,DIM)];
|
||||
denominatorVec = _mm256_add_ps(
|
||||
denominatorVec,
|
||||
_mm256_mul_ps(
|
||||
_mm256_loadu_ps(&A[solver_gauss_seidel_get_index(i,j,k,DIM)]),
|
||||
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i,j,k,DIM)])
|
||||
)
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//collect convergence
|
||||
_mm256_storeu_ps(vec_sum_storage,convergenceVec);
|
||||
convergence =
|
||||
vec_sum_storage[0] + vec_sum_storage[1] +
|
||||
vec_sum_storage[2] + vec_sum_storage[3] +
|
||||
vec_sum_storage[4] + vec_sum_storage[5] +
|
||||
vec_sum_storage[6] + vec_sum_storage[7]
|
||||
;
|
||||
|
||||
//collect denominator
|
||||
_mm256_storeu_ps(vec_sum_storage,denominatorVec);
|
||||
denominator =
|
||||
vec_sum_storage[0] + vec_sum_storage[1] +
|
||||
vec_sum_storage[2] + vec_sum_storage[3] +
|
||||
vec_sum_storage[4] + vec_sum_storage[5] +
|
||||
vec_sum_storage[6] + vec_sum_storage[7]
|
||||
;
|
||||
|
||||
//have hit the desired level of convergence
|
||||
if(convergence < CONJUGATE_GRADIENT_EPSILON && convergence > -CONJUGATE_GRADIENT_EPSILON){
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
//error check
|
||||
if(denominator < CONJUGATE_GRADIENT_EPSILON && denominator > -CONJUGATE_GRADIENT_EPSILON){
|
||||
printf("Divide by 0! %f \n", denominator);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
|
||||
alpha = convergence / denominator;
|
||||
r_new_dot = 0;
|
||||
alphaVec = _mm256_set1_ps(alpha);
|
||||
|
||||
for(k=1; k<DIM-1; k++){
|
||||
for(j=1; j<DIM-1; j++){
|
||||
// for(i = 1; i < DIM-1; i++){
|
||||
// phi[ode_index(i,j,k,DIM)] = phi[ode_index(i,j,k,DIM)] + alpha * p[ode_index(i,j,k,DIM)];
|
||||
// r[ode_index(i,j,k,DIM)] = r[ode_index(i,j,k,DIM)] - alpha * A[ode_index(i,j,k,DIM)];
|
||||
// r_new_dot = r_new_dot + r[ode_index(i,j,k,DIM)] * r[ode_index(i,j,k,DIM)];
|
||||
// }
|
||||
for(i = 1; i < DIM-1; i=i+8){
|
||||
//phi[ode_index(i,j,k,DIM)] = phi[ode_index(i,j,k,DIM)] + alpha * p[ode_index(i,j,k,DIM)];
|
||||
_mm256_storeu_ps(
|
||||
&phi[ode_index(i,j,k,DIM)],
|
||||
_mm256_add_ps(
|
||||
_mm256_loadu_ps(&phi[solver_gauss_seidel_get_index(i,j,k,DIM)]),
|
||||
_mm256_mul_ps(
|
||||
alphaVec,
|
||||
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i,j,k,DIM)])
|
||||
)
|
||||
)
|
||||
);
|
||||
// r[ode_index(i,j,k,DIM)] = r[ode_index(i,j,k,DIM)] - alpha * A[ode_index(i,j,k,DIM)];
|
||||
_mm256_storeu_ps(
|
||||
&r[ode_index(i,j,k,DIM)],
|
||||
_mm256_sub_ps(
|
||||
_mm256_loadu_ps(&r[solver_gauss_seidel_get_index(i,j,k,DIM)]),
|
||||
_mm256_mul_ps(
|
||||
alphaVec,
|
||||
_mm256_loadu_ps(&A[solver_gauss_seidel_get_index(i,j,k,DIM)])
|
||||
)
|
||||
)
|
||||
);
|
||||
// r_new_dot = r_new_dot + r[ode_index(i,j,k,DIM)] * r[ode_index(i,j,k,DIM)];
|
||||
rdotVec = _mm256_add_ps(
|
||||
rdotVec,
|
||||
_mm256_mul_ps(
|
||||
_mm256_loadu_ps(&r[solver_gauss_seidel_get_index(i,j,k,DIM)]),
|
||||
_mm256_loadu_ps(&r[solver_gauss_seidel_get_index(i,j,k,DIM)])
|
||||
)
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//collect rdot
|
||||
_mm256_storeu_ps(vec_sum_storage,rdotVec);
|
||||
r_new_dot =
|
||||
vec_sum_storage[0] + vec_sum_storage[1] +
|
||||
vec_sum_storage[2] + vec_sum_storage[3] +
|
||||
vec_sum_storage[4] + vec_sum_storage[5] +
|
||||
vec_sum_storage[6] + vec_sum_storage[7]
|
||||
;
|
||||
|
||||
//calculate beta
|
||||
beta = r_new_dot / convergence;
|
||||
betaVec = _mm256_set1_ps(beta);
|
||||
|
||||
//update p
|
||||
for(k=1; k<DIM-1; k++){
|
||||
for(j=1; j<DIM-1; j++){
|
||||
// for(i = 1; i < DIM-1; i++){
|
||||
// p[ode_index(i,j,k,DIM)] = r[ode_index(i,j,k,DIM)] + beta * p[ode_index(i,j,k,DIM)];
|
||||
// }
|
||||
for(i = 1; i < DIM-1; i=i+8){
|
||||
//p[ode_index(i,j,k,DIM)] = r[ode_index(i,j,k,DIM)] + beta * p[ode_index(i,j,k,DIM)];
|
||||
_mm256_storeu_ps(
|
||||
&p[ode_index(i,j,k,DIM)],
|
||||
_mm256_add_ps(
|
||||
_mm256_loadu_ps(&r[solver_gauss_seidel_get_index(i,j,k,DIM)]),
|
||||
_mm256_mul_ps(
|
||||
betaVec,
|
||||
_mm256_loadu_ps(&p[solver_gauss_seidel_get_index(i,j,k,DIM)])
|
||||
)
|
||||
)
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
return (float)sqrt(convergence);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@ -22,7 +22,7 @@
|
||||
/**
|
||||
* Error margin for tests
|
||||
*/
|
||||
#define FLUID_GRID2_PROJECTION_ERROR_MARGIN 0.00001f
|
||||
#define FLUID_GRID2_PROJECTION_ERROR_MARGIN 0.005f
|
||||
|
||||
/**
|
||||
* Maximum convergence we want to see in any chunk
|
||||
|
||||
@ -229,9 +229,9 @@ int fluid_sim_grid2_speed_tests(){
|
||||
|
||||
fluid_grid2_allocate_arrays();
|
||||
|
||||
// rVal += fluid_sim_grid2_speed_test1();
|
||||
// rVal += fluid_sim_grid2_speed_test2();
|
||||
// rVal += fluid_sim_grid2_speed_test3();
|
||||
rVal += fluid_sim_grid2_speed_test1();
|
||||
rVal += fluid_sim_grid2_speed_test2();
|
||||
rVal += fluid_sim_grid2_speed_test3();
|
||||
|
||||
return rVal;
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user