Parallelize part of code

This commit is contained in:
unknown 2023-07-16 21:41:35 -04:00
parent 8e6b6e4095
commit 10f3408f0f
5 changed files with 117 additions and 16 deletions

Binary file not shown.

View File

@ -40,7 +40,7 @@ rm -f ./*.dll
#compile object files
COMPILE_FLAGS="-c -fPIC -m64"
COMPILE_FLAGS="-c -fPIC -m64 -mavx -mavx2"
INPUT_FILES="./fluidsim.c"
OUTPUT_FILE="./fluidsim.o"
gcc $COMPILE_FLAGS -I"$BASE_INCLUDE_DIR" -I"$OS_INCLUDE_DIR" $INPUT_FILES -o $OUTPUT_FILE

View File

@ -1,6 +1,6 @@
#include <jni.h>
#include <stdio.h>
#include <immintrin.h>
#define SWAP(x0,x) {float *tmp=x0;x0=x;x=tmp;}
@ -128,7 +128,7 @@ void advect(int N, int b, float * d, float * d0, float * u, float * v, float * w
void dens_step(int N, float * x, float * x0, float * u, float * v, float * w, float diff, float dt){
// add_source ( N, x, x0, dt );
add_source(N, x, x0, dt);
SWAP(x0, x);
diffuse(N, 0, x, x0, diff, dt);
SWAP(x0, x);
@ -155,6 +155,8 @@ void vel_step(int N, float * u, float * v, float * w, float * u0, float * v0, fl
project(N, u, v, w, u0, v0);
}
float container[16];
void project(int N, float * u, float * v, float * w, float * p, float * div){
int i, j, k;
@ -168,11 +170,65 @@ void project(int N, float * u, float * v, float * w, float * p, float * div){
lin_solve(N, 0, p, div, 1, 6);
for ( i=1 ; i<N-1 ; i++ ) { for ( j=1 ; j<N-1 ; j++ ) { for ( k=1 ; k<N-1 ; k++ ) {
u[IX(i,j,k)] -= 0.5f*N*(p[IX(i+1,j,k)]-p[IX(i-1,j,k)]);
v[IX(i,j,k)] -= 0.5f*N*(p[IX(i,j+1,k)]-p[IX(i,j-1,k)]);
w[IX(i,j,k)] -= 0.5f*N*(p[IX(i,j,k+1)]-p[IX(i,j,k-1)]);
}}}
__m256 constScalar = _mm256_set1_ps(0.5f*N);
__m256 vector;
__m256 vector2;
for ( k=1 ; k<N-1 ; k++ ) {
for ( j=1 ; j<N-1 ; j++ ) {
//
//v
//
//lower
vector = _mm256_loadu_ps(&p[IX(1+1,j,k)]);
vector2 = _mm256_loadu_ps(&p[IX(1-1,j,k)]);
vector = _mm256_sub_ps(vector,vector2);
vector = _mm256_mul_ps(vector,constScalar);
vector = _mm256_sub_ps(_mm256_loadu_ps(&u[IX(1,j,k)]),vector);
_mm256_storeu_ps(&u[IX(1,j,k)],vector);
//upper
vector = _mm256_loadu_ps(&p[IX(9+1,j,k)]);
vector2 = _mm256_loadu_ps(&p[IX(9-1,j,k)]);
vector = _mm256_sub_ps(vector,vector2);
vector = _mm256_mul_ps(vector,constScalar);
vector = _mm256_sub_ps(_mm256_loadu_ps(&u[IX(9,j,k)]),vector);
_mm256_storeu_ps(&u[IX(9,j,k)],vector);
//
//v
//
//lower
vector = _mm256_loadu_ps(&p[IX(1,j+1,k)]);
vector2 = _mm256_loadu_ps(&p[IX(1,j-1,k)]);
vector = _mm256_sub_ps(vector,vector2);
vector = _mm256_mul_ps(vector,constScalar);
vector = _mm256_sub_ps(_mm256_loadu_ps(&v[IX(1,j,k)]),vector);
_mm256_storeu_ps(&v[IX(1,j,k)],vector);
//upper
vector = _mm256_loadu_ps(&p[IX(9,j+1,k)]);
vector2 = _mm256_loadu_ps(&p[IX(9,j-1,k)]);
vector = _mm256_sub_ps(vector,vector2);
vector = _mm256_mul_ps(vector,constScalar);
vector = _mm256_sub_ps(_mm256_loadu_ps(&v[IX(9,j,k)]),vector);
_mm256_storeu_ps(&v[IX(9,j,k)],vector);
//
//w
//
//lower
vector = _mm256_loadu_ps(&p[IX(1,j,k+1)]);
vector2 = _mm256_loadu_ps(&p[IX(1,j,k-1)]);
vector = _mm256_sub_ps(vector,vector2);
vector = _mm256_mul_ps(vector,constScalar);
vector = _mm256_sub_ps(_mm256_loadu_ps(&w[IX(1,j,k)]),vector);
_mm256_storeu_ps(&w[IX(1,j,k)],vector);
//upper
vector = _mm256_loadu_ps(&p[IX(9,j,k+1)]);
vector2 = _mm256_loadu_ps(&p[IX(9,j,k-1)]);
vector = _mm256_sub_ps(vector,vector2);
vector = _mm256_mul_ps(vector,constScalar);
vector = _mm256_sub_ps(_mm256_loadu_ps(&w[IX(9,j,k)]),vector);
_mm256_storeu_ps(&w[IX(9,j,k)],vector);
}
}
set_bnd(N, 1, u);
set_bnd(N, 2, v);
@ -181,14 +237,59 @@ void project(int N, float * u, float * v, float * w, float * p, float * div){
void lin_solve(int N, int b, float* x, float* x0, float a, float c){
int i, j, k, l;
int i, j, k, l, m;
__m256 aScalar = _mm256_set1_ps(a);
__m256 cScalar = _mm256_set1_ps(c);
// iterate the solver
for ( l=0 ; l<LINEARSOLVERTIMES ; l++ ) {
// update for each cell
for ( i=1 ; i<N-1 ; i++ ) { for ( j=1 ; j<N-1 ; j++ ) { for ( k=1 ; k<N-1 ; k++ ) {
x[IX(i,j,k)] = (x0[IX(i,j,k)] + a*(x[IX(i-1,j,k)]+x[IX(i+1,j,k)]+x[IX(i,j-1,k)]+x[IX(i,j+1,k)]+x[IX(i,j,k-1)]+x[IX(i,j,k+1)]))/c;
}}}
for(k=1; k<N-1; k++){
for(j=1; j<N-1; j++){
for(int i = 1; i < N-1; i++){
x[IX(i,j,k)] = (x0[IX(i,j,k)] + a*(x[IX(i-1,j,k)]+x[IX(i+1,j,k)]+x[IX(i,j-1,k)]+x[IX(i,j+1,k)]+x[IX(i,j,k-1)]+x[IX(i,j,k+1)]))/c;
}
//solve first item on row
// i=1;
// x[IX(i,j,k)] = (x0[IX(i,j,k)] + a*(x[IX(i-1,j,k)]+x[IX(i+1,j,k)]+x[IX(i,j-1,k)]+x[IX(i,j+1,k)]+x[IX(i,j,k-1)]+x[IX(i,j,k+1)]))/c;
// // for(int n = 2; n<N-2;n++){
// // x[IX(i,j,n)] = (a*(x[IX(i-1,j,n)]+x[IX(i+1,j,n)]+x[IX(i,j-1,n)]+x[IX(i,j+1,n)]+x[IX(i,j,n-1)]+x[IX(i,j,n+1)]));
// // }
// // if(x[IX(i,j,2)] > 0.0f){
// // printf("%f, %f\n",x[IX(i,j,2)],x0[IX(i,j,2)]);
// // }
// //vectorize solve as many as can be packed into vector ops
// //because we're operating on the innermost loop, guaranteed that k is align memory wise for all offsets
// for(i=2; i+8<N-2;i=i+8){
// // __m256 vector = _mm256_loadu_ps(&x[IX(i,j,k)]);
// // __m256 vector = _mm256_loadu_ps(&x[IX(i-1,j,k)]);
// // vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i+1,j,k)]));
// // vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i,j-1,k)]));
// // vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i,j+1,k)]));
// // vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i,j,k-1)]));
// // vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i,j,k+1)]));
// // vector = _mm256_mul_ps(vector,aScalar);
// // vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x0[IX(i,j,k)]));
// // vector = _mm256_div_ps(vector,cScalar);
// // _mm256_storeu_ps(&x[IX(i,j,k)],vector);
// // x[IX(i,j,k)] = (x0[IX(i,j,k)] + a*(x[IX(i-1,j,k)]+x[IX(i+1,j,k)]+x[IX(i,j-1,k)]+x[IX(i,j+1,k)]+x[IX(i,j,k-1)]+x[IX(i,j,k+1)]))/c;
// }
// // if(x[IX(i,j,2)] > 0.0f){
// // printf("%f, %f\n",x[IX(i,j,2)],x0[IX(i,j,2)]);
// // }
// for(int n = 2; n<i; n++){
// // x[IX(i,j,n)] = (
// // (x[IX(i,j,n)] + x[IX(i,j,n-1)] + x[IX(i,j,n+1)])
// // * a + x0[IX(i,j,n)])/ c;
// x[IX(i,j,n)] = (x0[IX(i,j,n)] + a*(x[IX(i-1,j,n)]+x[IX(i+1,j,n)]+x[IX(i,j-1,n)]+x[IX(i,j+1,n)]+x[IX(i,j,n-1)]+x[IX(i,j,n+1)]))/c;
// }
// for(; i<N-1; i++){
// x[IX(i,j,k)] = (x0[IX(i,j,k)] + a*(x[IX(i-1,j,k)]+x[IX(i+1,j,k)]+x[IX(i,j-1,k)]+x[IX(i,j+1,k)]+x[IX(i,j,k-1)]+x[IX(i,j,k+1)]))/c;
// }
}
}
set_bnd(N, b, x);
}
}

View File

@ -10,9 +10,9 @@ extern "C" {
#undef electrosphere_FluidSim_DIM
#define electrosphere_FluidSim_DIM 18L
#undef electrosphere_FluidSim_DIFFUSION_CONSTANT
#define electrosphere_FluidSim_DIFFUSION_CONSTANT 0.0f
#define electrosphere_FluidSim_DIFFUSION_CONSTANT 1.0E-5f
#undef electrosphere_FluidSim_VISCOSITY_CONSTANT
#define electrosphere_FluidSim_VISCOSITY_CONSTANT 0.0f
#define electrosphere_FluidSim_VISCOSITY_CONSTANT 1.0E-5f
#undef electrosphere_FluidSim_LINEARSOLVERTIMES
#define electrosphere_FluidSim_LINEARSOLVERTIMES 20L
#undef electrosphere_FluidSim_GRAVITY

View File

@ -51,8 +51,8 @@ public class FluidSim {
float[] w0ArrayView = new float[DIM * DIM * DIM];
static final float DIFFUSION_CONSTANT = 0.0f;
static final float VISCOSITY_CONSTANT = 0.0f;
static final float DIFFUSION_CONSTANT = 0.00001f;
static final float VISCOSITY_CONSTANT = 0.00001f;
static final int LINEARSOLVERTIMES = 20;