fully static inline parallel multigrid solver

2024-12-11 19:29:54 -05:00 · 2024-12-11 19:29:54 -05:00 · bc63cfd06c
commit bc63cfd06c
parent 99e89c9d88
4 changed files with 366 additions and 26 deletions
--- a/src/main/c/includes/math/ode/multigrid.h
+++ b/src/main/c/includes/math/ode/multigrid.h
@ -27,6 +27,58 @@ float solver_multigrid_iterate_serial(float * phi, float * phi0, float a, float
 */
 float solver_multigrid_iterate_parallel(float * phi, float * phi0, float a, float c);
 /**
 * Relaxes an ODE matrix by 1 iteration of multigrid method
 * @param phi The phi array
 * @param phi0 The phi array from the last frame
 * @param a The a const
 * @param c The c const
 * @param GRIDDIM The dimension of the phi grid
 * @return The residual
 */
 void solver_multigrid_iterate_serial_recursive(float * phi, float * phi0, float a, float c, int GRIDDIM);
 /**
 * Verifies that all grids are allocated
 */
 void solver_multigrid_initialization_check();
 /**
 * Calculates the residual of the grid
 * @return Returns the residual norm of the grid
 */
 float solver_multigrid_calculate_residual_norm_serial(float * phi, float * phi0, float a, float c, int GRIDDIM);
 /**
 * Serially restricts the current residual into the lower phi grid
 */
 void solver_multigrid_restrict_serial(float * currResidual, int GRIDDIM, float * lowerPhi, float * lowerPhi0, int LOWERDIM);
 /**
 * Prolongates a lower grid into a higher grid
 */
 void solver_multigrid_prolongate_serial(float * phi, int GRIDDIM, float * lowerPhi, int LOWERDIM);
 /**
 * Gets the phi to use for the current level of the multigrid solver
 */
 float * solver_multigrid_get_current_phi(int dim);
 /**
 * Gets the phi0 to use for the current level of the multigrid solver
 */
 float * solver_multigrid_get_current_phi0(int dim);
 /**
 * Gets the residual to use for the current level of the multigrid solver
 */
 float * solver_multigrid_get_current_residual(int dim);
 /**
 * Calculates the residual of the grid
 */
 void solver_multigrid_store_residual_serial(float * phi, float * phi0, float * residualGrid, float a, float c, int GRIDDIM);
 #endif
--- a/src/main/c/includes/math/ode/multigrid_parallel.h
+++ b/src/main/c/includes/math/ode/multigrid_parallel.h
@ -0,0 +1,294 @@
 #ifndef MATH_ODE_MULTIGRID_PARALLEL_H
 #define MATH_ODE_MULTIGRID_PARALLEL_H
 #include "math/ode/gauss_seidel.h"
 #include "math/ode/multigrid.h"
 #define MULTIGRID_PARALLEL_LOWEST_PARALLEL_DIM ((DIM - 2) / 1) + 2
 /**
 * Serially restricts the current residual into the lower phi grid
 */
 static inline void restrict_parallel(float * currResidual, int GRIDDIM, float * lowerPhi, float * lowerPhi0, int LOWERDIM){
    if(LOWERDIM < 10){
        solver_multigrid_restrict_serial(currResidual,GRIDDIM,lowerPhi,lowerPhi0,LOWERDIM);
        return;
    }
    int x, y, z;
    __m256 zeroVec = _mm256_setzero_ps();
    __m256 residuals;
    __m256i offsets = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    //
    //set first plane
    //
    x = 0;
    for(y = 0; y < LOWERDIM; y++){
        for(z = 0; z < LOWERDIM-7; z=z+8){
            _mm256_storeu_ps(&lowerPhi[solver_gauss_seidel_get_index(x,y,z,LOWERDIM)],zeroVec);
        }
        z = LOWERDIM - 8;
        _mm256_storeu_ps(&lowerPhi[solver_gauss_seidel_get_index(x,y,z,LOWERDIM)],zeroVec);
    }
    //
    //set main volume
    //
    for(x = 1; x < LOWERDIM - 1; x++){
        //
        //set the first edge
        //
        y = 0;
        for(z = 0; z < LOWERDIM-7; z=z+8){
            _mm256_storeu_ps(&lowerPhi[solver_gauss_seidel_get_index(x,y,z,LOWERDIM)],zeroVec);
        }
        z = LOWERDIM - 8;
        _mm256_storeu_ps(&lowerPhi[solver_gauss_seidel_get_index(x,y,z,LOWERDIM)],zeroVec);
        //
        //copy the main contents
        //
        for(y = 1; y < LOWERDIM - 1; y++){
            lowerPhi[solver_gauss_seidel_get_index(x,y,0,LOWERDIM)] = 0;
            for(z = 1; z < LOWERDIM-7; z=z+8){
                _mm256_storeu_ps(&lowerPhi[solver_gauss_seidel_get_index(x,y,z,LOWERDIM)],zeroVec);
                residuals = _mm256_i32gather_ps(&currResidual[solver_gauss_seidel_get_index(x*2,y*2,z*2,GRIDDIM)],offsets,sizeof(float));
                _mm256_storeu_ps(&lowerPhi0[solver_gauss_seidel_get_index(x,y,z,LOWERDIM)],residuals);
            }
            lowerPhi[solver_gauss_seidel_get_index(x,y,LOWERDIM - 1,LOWERDIM)] = 0;
        }
        //
        //set the last edge
        //
        for(z = 0; z < LOWERDIM-7; z=z+8){
            _mm256_storeu_ps(&lowerPhi[solver_gauss_seidel_get_index(x,y,z,LOWERDIM)],zeroVec);
        }
        z = LOWERDIM - 8;
        _mm256_storeu_ps(&lowerPhi[solver_gauss_seidel_get_index(x,y,z,LOWERDIM)],zeroVec);
    }
    //
    //set last plane
    //  
    x = LOWERDIM - 1;
    for(y = 0; y < LOWERDIM; y++){
        for(z = 0; z < LOWERDIM-7; z=z+8){
            _mm256_storeu_ps(&lowerPhi[solver_gauss_seidel_get_index(x,y,z,LOWERDIM)],zeroVec);
        }
        //zero out the end
        z = LOWERDIM - 8;
        _mm256_storeu_ps(&lowerPhi[solver_gauss_seidel_get_index(x,y,z,LOWERDIM)],zeroVec);
    }
    //populate grid
    for(x = 1; x < LOWERDIM - 1; x++){
        for(y = 1; y < LOWERDIM - 1; y++){
            for(z = 1; z < LOWERDIM - 1; z++){
                //direct transfer operator (faster, lower accuracy)
                lowerPhi0[solver_gauss_seidel_get_index(x,y,z,LOWERDIM)] = currResidual[solver_gauss_seidel_get_index(x*2,y*2,z*2,GRIDDIM)];
            }
        }
    }
 }
 /**
 * Calculates the residual of the grid
 */
 static inline float solver_multigrid_calculate_residual_parallel(float * phi, float * phi0, float a, float c, int GRIDDIM){
    //calculate residual
    __m256 aScalar = _mm256_set1_ps(a);
    __m256 cScalar = _mm256_set1_ps(c);
    __m256 collector = _mm256_setzero_ps();
    __m256 vector;
    float vec_sum_storage[8];
    float residual = 0;
    //transform u direction
    int i, j, k;
    int increment = 0;
    for(k=1; k<GRIDDIM-1; k++){
        for(j=1; j<GRIDDIM-1; j++){
            for(i=1; i<GRIDDIM-1; i=i+8){
                vector = _mm256_loadu_ps(&phi[solver_gauss_seidel_get_index(i-1,j,k,GRIDDIM)]);
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&phi[solver_gauss_seidel_get_index(i+1,j,k,GRIDDIM)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&phi[solver_gauss_seidel_get_index(i,j-1,k,GRIDDIM)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&phi[solver_gauss_seidel_get_index(i,j+1,k,GRIDDIM)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&phi[solver_gauss_seidel_get_index(i,j,k-1,GRIDDIM)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&phi[solver_gauss_seidel_get_index(i,j,k+1,GRIDDIM)]));
                vector = _mm256_mul_ps(vector,aScalar);
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&phi0[solver_gauss_seidel_get_index(i,j,k,GRIDDIM)]));
                vector = _mm256_div_ps(vector,cScalar);
                collector = _mm256_add_ps(collector,vector);
                increment++;
                if(increment > GRIDDIM*GRIDDIM*GRIDDIM){
                    printf("INCREMENT FAILURE   %d %d %d %d \n",i,j,k,increment);
                    return -1;
                }
            }
        }
    }
    _mm256_storeu_ps(vec_sum_storage,collector);
    residual =
        vec_sum_storage[0] + vec_sum_storage[1] + 
        vec_sum_storage[2] + vec_sum_storage[3] + 
        vec_sum_storage[4] + vec_sum_storage[5] + 
        vec_sum_storage[6] + vec_sum_storage[7]
    ;
    return residual;
 }
 /**
 * Prolongates a lower grid into a higher grid
 */
 static inline void prolongate_parallel(float * phi, int GRIDDIM, float * lowerPhi, int LOWERDIM){
    if(LOWERDIM < 10){
        solver_multigrid_prolongate_serial(phi,GRIDDIM,lowerPhi,LOWERDIM);
        return;
    }
    __m256i offsets = _mm256_set_epi32(0, 0, 1, 1, 2, 2, 3, 3);
    __m256 lowerPhiVec;
    __m256 phiVec;
    int x, y, z;
    for(int x = 1; x < GRIDDIM - 1; x++){
        for(int y = 1; y < GRIDDIM - 1; y++){
            for(int z = 1; z < GRIDDIM - 1; z=z+8){
                phiVec = _mm256_loadu_ps(&phi[solver_gauss_seidel_get_index(x,y,z,GRIDDIM)]);
                lowerPhiVec = _mm256_i32gather_ps(&lowerPhi[solver_gauss_seidel_get_index(x/2,y/2,z/2,LOWERDIM)],offsets,sizeof(float));
                _mm256_storeu_ps(
                    &phi[solver_gauss_seidel_get_index(x,y,z,GRIDDIM)],
                    _mm256_add_ps(
                        phiVec,
                        lowerPhiVec
                    )
                );
            }
        }
    }
 }
 /**
 * Calculates the residual of the grid
 */
 static inline void solver_multigrid_parallel_store_residual(float * phi, float * phi0, float * residualGrid, float a, float c, int GRIDDIM){
    if(GRIDDIM < 10){
        solver_multigrid_store_residual_serial(phi,phi0,residualGrid,a,c,GRIDDIM);
        return;
    }
    __m256 laplacian;
    __m256 constVec = _mm256_set1_ps(6);
    //calculate residual
    int i, j, k;
    for(k=1; k<GRIDDIM-1; k++){
        for(j=1; j<GRIDDIM-1; j++){
            for(i=1; i<GRIDDIM-1; i=i+8){
                laplacian = 
                _mm256_sub_ps(
                    _mm256_mul_ps(
                        _mm256_loadu_ps(&phi[solver_gauss_seidel_get_index(i,j,k,GRIDDIM)]),
                        constVec
                    ),
                    _mm256_add_ps(
                        _mm256_add_ps(
                            _mm256_add_ps(
                                _mm256_loadu_ps(&phi[solver_gauss_seidel_get_index(i-1,j,k,GRIDDIM)]),
                                _mm256_loadu_ps(&phi[solver_gauss_seidel_get_index(i+1,j,k,GRIDDIM)])
                            ),
                            _mm256_add_ps(
                                _mm256_loadu_ps(&phi[solver_gauss_seidel_get_index(i,j-1,k,GRIDDIM)]),
                                _mm256_loadu_ps(&phi[solver_gauss_seidel_get_index(i,j+1,k,GRIDDIM)])
                            )
                        ),
                        _mm256_add_ps(
                            _mm256_loadu_ps(&phi[solver_gauss_seidel_get_index(i,j,k-1,GRIDDIM)]),
                            _mm256_loadu_ps(&phi[solver_gauss_seidel_get_index(i,j,k+1,GRIDDIM)])
                        )
                    )
                );
                _mm256_storeu_ps(
                    &residualGrid[solver_gauss_seidel_get_index(i,j,k,GRIDDIM)],
                    _mm256_sub_ps(
                        _mm256_loadu_ps(&phi0[solver_gauss_seidel_get_index(i,j,k,GRIDDIM)]),
                        laplacian
                    )
                );
            }
        }
    }
 }
 /**
 * Relaxes an ODE matrix by 1 iteration of multigrid method
 * @param phi The phi array
 * @param phi0 The phi array from the last frame
 * @param a The a const
 * @param c The c const
 * @param GRIDDIM The dimension of the phi grid
 * @return The residual
 */
 static inline void solver_multigrid_parallel_iterate_recurse(float * phi, float * phi0, float a, float c, int GRIDDIM){
    int LOWERDIM = ((GRIDDIM - 2) / 2) + 2;
    float * currResidual = solver_multigrid_get_current_residual(GRIDDIM);
    float * lowerPhi = solver_multigrid_get_current_phi(LOWERDIM);
    float * lowerPhi0 = solver_multigrid_get_current_phi0(LOWERDIM);
    float * lowerResidual = solver_multigrid_get_current_residual(LOWERDIM);
    //smooth
    solver_gauss_seidel_iterate_parallel(phi,phi0,a,c,GRIDDIM);
    //compute residuals
    solver_multigrid_parallel_store_residual(phi,phi0,currResidual,a,c,GRIDDIM);
    //restrict
    restrict_parallel(currResidual,GRIDDIM,lowerPhi,lowerPhi0,LOWERDIM);
    //solve next-coarsest grid
    if(GRIDDIM <= MULTIGRID_PARALLEL_LOWEST_PARALLEL_DIM){
        solver_multigrid_iterate_serial_recursive(lowerPhi,lowerPhi0,a,c,LOWERDIM);
    } else {
        solver_multigrid_parallel_iterate_recurse(lowerPhi,lowerPhi0,a,c,LOWERDIM);
    }
    //interpolate from the lower grid
    prolongate_parallel(phi,GRIDDIM,lowerPhi,LOWERDIM);
    //smooth
    solver_gauss_seidel_iterate_parallel(phi,phi0,a,c,GRIDDIM);
 }
 /**
 * Relaxes an ODE matrix by 1 iteration of multigrid method
 * @param phi The phi array
 * @param phi0 The phi array from the last frame
 * @param a The a const
 * @param c The c const
 * @return The residual
 */
 static inline float solver_multigrid_parallel_iterate(float * phi, float * phi0, float a, float c){
    solver_multigrid_initialization_check();
    solver_multigrid_parallel_iterate_recurse(phi,phi0,a,c,DIM);
    return solver_multigrid_calculate_residual_norm_serial(phi,phi0,a,c,DIM);
 }
 #endif
--- a/src/main/c/src/fluid/sim/grid2/velocity.c
+++ b/src/main/c/src/fluid/sim/grid2/velocity.c
@ -9,7 +9,7 @@
 #include "fluid/sim/grid2/velocity.h"
 #include "fluid/sim/grid2/utilities.h"
 #include "math/ode/gauss_seidel.h"
-#include "math/ode/multigrid.h"
+#include "math/ode/multigrid_parallel.h"
 #include "math/ode/conjugate_gradient.h"
 #define SET_BOUND_IGNORE 0
@ -181,7 +181,7 @@ LIBRARY_API void fluid_grid2_solveProjection(
    float residual = 1;
    int iteration = 0;
    while(iteration < FLUID_GRID2_LINEARSOLVERTIMES && (residual > FLUID_GRID2_PROJECTION_CONVERGENCE_TOLERANCE || residual < -FLUID_GRID2_PROJECTION_CONVERGENCE_TOLERANCE)){
-        residual = solver_multigrid_iterate_parallel(p,div,a,c);
+        residual = solver_multigrid_parallel_iterate(p,div,a,c);
        fluid_grid2_set_bounds(FLUID_GRID2_BOUND_NO_DIR,p);
        iteration++;
    }
--- a/src/main/c/src/math/ode/multigrid.c
+++ b/src/main/c/src/math/ode/multigrid.c
@ -51,12 +51,6 @@ static float * eighthGridResidual = NULL;
 float solver_multigrid_calculate_residual_norm_serial(float * phi, float * phi0, float a, float c, int GRIDDIM);
 float solver_multigrid_calculate_residual_parallel(float * phi, float * phi0, float a, float c, int GRIDDIM);
 void solver_multigrid_store_residual_serial(float * phi, float * phi0, float * residualGrid, float a, float c, int GRIDDIM);
 float * get_current_phi(int dim);
 float * get_current_phi0(int dim);
 float * get_current_residual(int dim);
 void initialization_check();
 void restrict_serial(float * currResidual, int GRIDDIM, float * lowerPhi, float * lowerPhi0, int LOWERDIM);
 void prolongate_serial(float * phi, int GRIDDIM, float * lowerPhi, int LOWERDIM);
 //parallelized operations
 void restrict_parallel(float * currResidual, int GRIDDIM, float * lowerPhi, float * lowerPhi0, int LOWERDIM);
@ -74,10 +68,10 @@ void solver_multigrid_store_residual_parallel(float * phi, float * phi0, float *
 */
 void solver_multigrid_iterate_serial_recursive(float * phi, float * phi0, float a, float c, int GRIDDIM){
    int LOWERDIM = ((GRIDDIM - 2) / 2) + 2;
-    float * currResidual = get_current_residual(GRIDDIM);
+    float * currResidual = solver_multigrid_get_current_residual(GRIDDIM);
-    float * lowerPhi = get_current_phi(LOWERDIM);
+    float * lowerPhi = solver_multigrid_get_current_phi(LOWERDIM);
-    float * lowerPhi0 = get_current_phi0(LOWERDIM);
+    float * lowerPhi0 = solver_multigrid_get_current_phi0(LOWERDIM);
-    float * lowerResidual = get_current_residual(LOWERDIM);
+    float * lowerResidual = solver_multigrid_get_current_residual(LOWERDIM);
    //smooth
    solver_gauss_seidel_iterate_parallel(phi,phi0,a,c,GRIDDIM);
@ -133,7 +127,7 @@ void solver_multigrid_iterate_serial_recursive(float * phi, float * phi0, float
 * @return The residual
 */
 float solver_multigrid_iterate_serial(float * phi, float * phi0, float a, float c){
-    initialization_check();
+    solver_multigrid_initialization_check();
    solver_multigrid_iterate_serial_recursive(phi,phi0,a,c,DIM);
    return solver_multigrid_calculate_residual_norm_serial(phi,phi0,a,c,DIM);
 }
@ -150,10 +144,10 @@ float solver_multigrid_iterate_serial(float * phi, float * phi0, float a, float
 */
 void solver_multigrid_iterate_parallel_recursive(float * phi, float * phi0, float a, float c, int GRIDDIM){
    int LOWERDIM = ((GRIDDIM - 2) / 2) + 2;
-    float * currResidual = get_current_residual(GRIDDIM);
+    float * currResidual = solver_multigrid_get_current_residual(GRIDDIM);
-    float * lowerPhi = get_current_phi(LOWERDIM);
+    float * lowerPhi = solver_multigrid_get_current_phi(LOWERDIM);
-    float * lowerPhi0 = get_current_phi0(LOWERDIM);
+    float * lowerPhi0 = solver_multigrid_get_current_phi0(LOWERDIM);
-    float * lowerResidual = get_current_residual(LOWERDIM);
+    float * lowerResidual = solver_multigrid_get_current_residual(LOWERDIM);
    //smooth
    solver_gauss_seidel_iterate_parallel(phi,phi0,a,c,GRIDDIM);
@ -187,7 +181,7 @@ void solver_multigrid_iterate_parallel_recursive(float * phi, float * phi0, floa
 * @return The residual
 */
 float solver_multigrid_iterate_parallel(float * phi, float * phi0, float a, float c){
-    initialization_check();
+    solver_multigrid_initialization_check();
    solver_multigrid_iterate_parallel_recursive(phi,phi0,a,c,DIM);
    return solver_multigrid_calculate_residual_norm_serial(phi,phi0,a,c,DIM);
 }
@ -197,7 +191,7 @@ float solver_multigrid_iterate_parallel(float * phi, float * phi0, float a, floa
 /**
 * Gets the phi to use for the current level of the multigrid solver
 */
-float * get_current_phi(int dim){
+float * solver_multigrid_get_current_phi(int dim){
    if(dim == ((DIM-2)/2) + 2){
        return halfGridPhi;
    } else if(dim == ((DIM-2)/4) + 2){
@ -214,7 +208,7 @@ float * get_current_phi(int dim){
 /**
 * Gets the phi0 to use for the current level of the multigrid solver
 */
-float * get_current_phi0(int dim){
+float * solver_multigrid_get_current_phi0(int dim){
    if(dim == ((DIM-2)/2) + 2){
        return halfGridPhi0;
    } else if(dim == ((DIM-2)/4) + 2){
@ -231,7 +225,7 @@ float * get_current_phi0(int dim){
 /**
 * Gets the residual to use for the current level of the multigrid solver
 */
-float * get_current_residual(int dim){
+float * solver_multigrid_get_current_residual(int dim){
    if(dim == 18){
        return fullGridResidual;
    } else if(dim == ((DIM-2)/2) + 2){
@ -250,7 +244,7 @@ float * get_current_residual(int dim){
 /**
 * Serially restricts the current residual into the lower phi grid
 */
-void restrict_serial(float * currResidual, int GRIDDIM, float * lowerPhi, float * lowerPhi0, int LOWERDIM){
+void solver_multigrid_restrict_serial(float * currResidual, int GRIDDIM, float * lowerPhi, float * lowerPhi0, int LOWERDIM){
    int x, y, z;
    //restrict
    //(current operator is injection -- inject r^2 from this grid at phi0 of the smaller grid)
@ -278,7 +272,7 @@ void restrict_serial(float * currResidual, int GRIDDIM, float * lowerPhi, float
 */
 void restrict_parallel(float * currResidual, int GRIDDIM, float * lowerPhi, float * lowerPhi0, int LOWERDIM){
    if(LOWERDIM < 10){
-        restrict_serial(currResidual,GRIDDIM,lowerPhi,lowerPhi0,LOWERDIM);
+        solver_multigrid_restrict_serial(currResidual,GRIDDIM,lowerPhi,lowerPhi0,LOWERDIM);
        return;
    }
    int x, y, z;
@ -364,7 +358,7 @@ void restrict_parallel(float * currResidual, int GRIDDIM, float * lowerPhi, floa
 /**
 * Prolongates a lower grid into a higher grid
 */
-void prolongate_serial(float * phi, int GRIDDIM, float * lowerPhi, int LOWERDIM){
+void solver_multigrid_prolongate_serial(float * phi, int GRIDDIM, float * lowerPhi, int LOWERDIM){
    int x, y, z;
    for(int x = 1; x < GRIDDIM - 1; x++){
        for(int y = 1; y < GRIDDIM - 1; y++){
@ -384,7 +378,7 @@ void prolongate_serial(float * phi, int GRIDDIM, float * lowerPhi, int LOWERDIM)
 */
 void prolongate_parallel(float * phi, int GRIDDIM, float * lowerPhi, int LOWERDIM){
    if(LOWERDIM < 10){
-        prolongate_serial(phi,GRIDDIM,lowerPhi,LOWERDIM);
+        solver_multigrid_prolongate_serial(phi,GRIDDIM,lowerPhi,LOWERDIM);
        return;
    }
    __m256i offsets = _mm256_set_epi32(0, 0, 1, 1, 2, 2, 3, 3);
@ -411,7 +405,7 @@ void prolongate_parallel(float * phi, int GRIDDIM, float * lowerPhi, int LOWERDI
 /**
 * Verifies that all grids are allocated
 */
-void initialization_check(){
+void solver_multigrid_initialization_check(){
    // full res
    if(fullGridResidual == NULL){
        fullGridResidual = (float *)calloc(1,DIM * DIM * DIM * sizeof(float));