Parallelize part of code
This commit is contained in:
parent
8e6b6e4095
commit
10f3408f0f
Binary file not shown.
@ -40,7 +40,7 @@ rm -f ./*.dll
|
||||
|
||||
|
||||
#compile object files
|
||||
COMPILE_FLAGS="-c -fPIC -m64"
|
||||
COMPILE_FLAGS="-c -fPIC -m64 -mavx -mavx2"
|
||||
INPUT_FILES="./fluidsim.c"
|
||||
OUTPUT_FILE="./fluidsim.o"
|
||||
gcc $COMPILE_FLAGS -I"$BASE_INCLUDE_DIR" -I"$OS_INCLUDE_DIR" $INPUT_FILES -o $OUTPUT_FILE
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#include <jni.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
|
||||
#define SWAP(x0,x) {float *tmp=x0;x0=x;x=tmp;}
|
||||
@ -128,7 +128,7 @@ void advect(int N, int b, float * d, float * d0, float * u, float * v, float * w
|
||||
|
||||
|
||||
void dens_step(int N, float * x, float * x0, float * u, float * v, float * w, float diff, float dt){
|
||||
// add_source ( N, x, x0, dt );
|
||||
add_source(N, x, x0, dt);
|
||||
SWAP(x0, x);
|
||||
diffuse(N, 0, x, x0, diff, dt);
|
||||
SWAP(x0, x);
|
||||
@ -155,6 +155,8 @@ void vel_step(int N, float * u, float * v, float * w, float * u0, float * v0, fl
|
||||
project(N, u, v, w, u0, v0);
|
||||
}
|
||||
|
||||
float container[16];
|
||||
|
||||
void project(int N, float * u, float * v, float * w, float * p, float * div){
|
||||
int i, j, k;
|
||||
|
||||
@ -168,11 +170,65 @@ void project(int N, float * u, float * v, float * w, float * p, float * div){
|
||||
|
||||
lin_solve(N, 0, p, div, 1, 6);
|
||||
|
||||
for ( i=1 ; i<N-1 ; i++ ) { for ( j=1 ; j<N-1 ; j++ ) { for ( k=1 ; k<N-1 ; k++ ) {
|
||||
u[IX(i,j,k)] -= 0.5f*N*(p[IX(i+1,j,k)]-p[IX(i-1,j,k)]);
|
||||
v[IX(i,j,k)] -= 0.5f*N*(p[IX(i,j+1,k)]-p[IX(i,j-1,k)]);
|
||||
w[IX(i,j,k)] -= 0.5f*N*(p[IX(i,j,k+1)]-p[IX(i,j,k-1)]);
|
||||
}}}
|
||||
|
||||
__m256 constScalar = _mm256_set1_ps(0.5f*N);
|
||||
__m256 vector;
|
||||
__m256 vector2;
|
||||
for ( k=1 ; k<N-1 ; k++ ) {
|
||||
for ( j=1 ; j<N-1 ; j++ ) {
|
||||
//
|
||||
//v
|
||||
//
|
||||
//lower
|
||||
vector = _mm256_loadu_ps(&p[IX(1+1,j,k)]);
|
||||
vector2 = _mm256_loadu_ps(&p[IX(1-1,j,k)]);
|
||||
vector = _mm256_sub_ps(vector,vector2);
|
||||
vector = _mm256_mul_ps(vector,constScalar);
|
||||
vector = _mm256_sub_ps(_mm256_loadu_ps(&u[IX(1,j,k)]),vector);
|
||||
_mm256_storeu_ps(&u[IX(1,j,k)],vector);
|
||||
//upper
|
||||
vector = _mm256_loadu_ps(&p[IX(9+1,j,k)]);
|
||||
vector2 = _mm256_loadu_ps(&p[IX(9-1,j,k)]);
|
||||
vector = _mm256_sub_ps(vector,vector2);
|
||||
vector = _mm256_mul_ps(vector,constScalar);
|
||||
vector = _mm256_sub_ps(_mm256_loadu_ps(&u[IX(9,j,k)]),vector);
|
||||
_mm256_storeu_ps(&u[IX(9,j,k)],vector);
|
||||
//
|
||||
//v
|
||||
//
|
||||
//lower
|
||||
vector = _mm256_loadu_ps(&p[IX(1,j+1,k)]);
|
||||
vector2 = _mm256_loadu_ps(&p[IX(1,j-1,k)]);
|
||||
vector = _mm256_sub_ps(vector,vector2);
|
||||
vector = _mm256_mul_ps(vector,constScalar);
|
||||
vector = _mm256_sub_ps(_mm256_loadu_ps(&v[IX(1,j,k)]),vector);
|
||||
_mm256_storeu_ps(&v[IX(1,j,k)],vector);
|
||||
//upper
|
||||
vector = _mm256_loadu_ps(&p[IX(9,j+1,k)]);
|
||||
vector2 = _mm256_loadu_ps(&p[IX(9,j-1,k)]);
|
||||
vector = _mm256_sub_ps(vector,vector2);
|
||||
vector = _mm256_mul_ps(vector,constScalar);
|
||||
vector = _mm256_sub_ps(_mm256_loadu_ps(&v[IX(9,j,k)]),vector);
|
||||
_mm256_storeu_ps(&v[IX(9,j,k)],vector);
|
||||
//
|
||||
//w
|
||||
//
|
||||
//lower
|
||||
vector = _mm256_loadu_ps(&p[IX(1,j,k+1)]);
|
||||
vector2 = _mm256_loadu_ps(&p[IX(1,j,k-1)]);
|
||||
vector = _mm256_sub_ps(vector,vector2);
|
||||
vector = _mm256_mul_ps(vector,constScalar);
|
||||
vector = _mm256_sub_ps(_mm256_loadu_ps(&w[IX(1,j,k)]),vector);
|
||||
_mm256_storeu_ps(&w[IX(1,j,k)],vector);
|
||||
//upper
|
||||
vector = _mm256_loadu_ps(&p[IX(9,j,k+1)]);
|
||||
vector2 = _mm256_loadu_ps(&p[IX(9,j,k-1)]);
|
||||
vector = _mm256_sub_ps(vector,vector2);
|
||||
vector = _mm256_mul_ps(vector,constScalar);
|
||||
vector = _mm256_sub_ps(_mm256_loadu_ps(&w[IX(9,j,k)]),vector);
|
||||
_mm256_storeu_ps(&w[IX(9,j,k)],vector);
|
||||
}
|
||||
}
|
||||
|
||||
set_bnd(N, 1, u);
|
||||
set_bnd(N, 2, v);
|
||||
@ -181,14 +237,59 @@ void project(int N, float * u, float * v, float * w, float * p, float * div){
|
||||
|
||||
|
||||
void lin_solve(int N, int b, float* x, float* x0, float a, float c){
|
||||
int i, j, k, l;
|
||||
int i, j, k, l, m;
|
||||
|
||||
|
||||
__m256 aScalar = _mm256_set1_ps(a);
|
||||
__m256 cScalar = _mm256_set1_ps(c);
|
||||
// iterate the solver
|
||||
for ( l=0 ; l<LINEARSOLVERTIMES ; l++ ) {
|
||||
// update for each cell
|
||||
for ( i=1 ; i<N-1 ; i++ ) { for ( j=1 ; j<N-1 ; j++ ) { for ( k=1 ; k<N-1 ; k++ ) {
|
||||
x[IX(i,j,k)] = (x0[IX(i,j,k)] + a*(x[IX(i-1,j,k)]+x[IX(i+1,j,k)]+x[IX(i,j-1,k)]+x[IX(i,j+1,k)]+x[IX(i,j,k-1)]+x[IX(i,j,k+1)]))/c;
|
||||
}}}
|
||||
for(k=1; k<N-1; k++){
|
||||
for(j=1; j<N-1; j++){
|
||||
for(int i = 1; i < N-1; i++){
|
||||
x[IX(i,j,k)] = (x0[IX(i,j,k)] + a*(x[IX(i-1,j,k)]+x[IX(i+1,j,k)]+x[IX(i,j-1,k)]+x[IX(i,j+1,k)]+x[IX(i,j,k-1)]+x[IX(i,j,k+1)]))/c;
|
||||
}
|
||||
//solve first item on row
|
||||
// i=1;
|
||||
// x[IX(i,j,k)] = (x0[IX(i,j,k)] + a*(x[IX(i-1,j,k)]+x[IX(i+1,j,k)]+x[IX(i,j-1,k)]+x[IX(i,j+1,k)]+x[IX(i,j,k-1)]+x[IX(i,j,k+1)]))/c;
|
||||
|
||||
// // for(int n = 2; n<N-2;n++){
|
||||
// // x[IX(i,j,n)] = (a*(x[IX(i-1,j,n)]+x[IX(i+1,j,n)]+x[IX(i,j-1,n)]+x[IX(i,j+1,n)]+x[IX(i,j,n-1)]+x[IX(i,j,n+1)]));
|
||||
// // }
|
||||
// // if(x[IX(i,j,2)] > 0.0f){
|
||||
// // printf("%f, %f\n",x[IX(i,j,2)],x0[IX(i,j,2)]);
|
||||
// // }
|
||||
// //vectorize solve as many as can be packed into vector ops
|
||||
// //because we're operating on the innermost loop, guaranteed that k is align memory wise for all offsets
|
||||
// for(i=2; i+8<N-2;i=i+8){
|
||||
// // __m256 vector = _mm256_loadu_ps(&x[IX(i,j,k)]);
|
||||
// // __m256 vector = _mm256_loadu_ps(&x[IX(i-1,j,k)]);
|
||||
// // vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i+1,j,k)]));
|
||||
// // vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i,j-1,k)]));
|
||||
// // vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i,j+1,k)]));
|
||||
// // vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i,j,k-1)]));
|
||||
// // vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i,j,k+1)]));
|
||||
// // vector = _mm256_mul_ps(vector,aScalar);
|
||||
// // vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x0[IX(i,j,k)]));
|
||||
// // vector = _mm256_div_ps(vector,cScalar);
|
||||
// // _mm256_storeu_ps(&x[IX(i,j,k)],vector);
|
||||
// // x[IX(i,j,k)] = (x0[IX(i,j,k)] + a*(x[IX(i-1,j,k)]+x[IX(i+1,j,k)]+x[IX(i,j-1,k)]+x[IX(i,j+1,k)]+x[IX(i,j,k-1)]+x[IX(i,j,k+1)]))/c;
|
||||
// }
|
||||
// // if(x[IX(i,j,2)] > 0.0f){
|
||||
// // printf("%f, %f\n",x[IX(i,j,2)],x0[IX(i,j,2)]);
|
||||
// // }
|
||||
// for(int n = 2; n<i; n++){
|
||||
// // x[IX(i,j,n)] = (
|
||||
// // (x[IX(i,j,n)] + x[IX(i,j,n-1)] + x[IX(i,j,n+1)])
|
||||
// // * a + x0[IX(i,j,n)])/ c;
|
||||
// x[IX(i,j,n)] = (x0[IX(i,j,n)] + a*(x[IX(i-1,j,n)]+x[IX(i+1,j,n)]+x[IX(i,j-1,n)]+x[IX(i,j+1,n)]+x[IX(i,j,n-1)]+x[IX(i,j,n+1)]))/c;
|
||||
// }
|
||||
// for(; i<N-1; i++){
|
||||
// x[IX(i,j,k)] = (x0[IX(i,j,k)] + a*(x[IX(i-1,j,k)]+x[IX(i+1,j,k)]+x[IX(i,j-1,k)]+x[IX(i,j+1,k)]+x[IX(i,j,k-1)]+x[IX(i,j,k+1)]))/c;
|
||||
// }
|
||||
}
|
||||
}
|
||||
set_bnd(N, b, x);
|
||||
}
|
||||
}
|
||||
|
||||
@ -10,9 +10,9 @@ extern "C" {
|
||||
#undef electrosphere_FluidSim_DIM
|
||||
#define electrosphere_FluidSim_DIM 18L
|
||||
#undef electrosphere_FluidSim_DIFFUSION_CONSTANT
|
||||
#define electrosphere_FluidSim_DIFFUSION_CONSTANT 0.0f
|
||||
#define electrosphere_FluidSim_DIFFUSION_CONSTANT 1.0E-5f
|
||||
#undef electrosphere_FluidSim_VISCOSITY_CONSTANT
|
||||
#define electrosphere_FluidSim_VISCOSITY_CONSTANT 0.0f
|
||||
#define electrosphere_FluidSim_VISCOSITY_CONSTANT 1.0E-5f
|
||||
#undef electrosphere_FluidSim_LINEARSOLVERTIMES
|
||||
#define electrosphere_FluidSim_LINEARSOLVERTIMES 20L
|
||||
#undef electrosphere_FluidSim_GRAVITY
|
||||
|
||||
@ -51,8 +51,8 @@ public class FluidSim {
|
||||
float[] w0ArrayView = new float[DIM * DIM * DIM];
|
||||
|
||||
|
||||
static final float DIFFUSION_CONSTANT = 0.0f;
|
||||
static final float VISCOSITY_CONSTANT = 0.0f;
|
||||
static final float DIFFUSION_CONSTANT = 0.00001f;
|
||||
static final float VISCOSITY_CONSTANT = 0.00001f;
|
||||
|
||||
static final int LINEARSOLVERTIMES = 20;
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user