Parallelize linear solver

This commit is contained in:
unknown 2023-07-16 22:10:23 -04:00
parent 10f3408f0f
commit 25b74c7f93
3 changed files with 21 additions and 42 deletions

3
.gitignore vendored
View File

@ -6,4 +6,5 @@
/.classpath
/.project
/.vscode
/shared-folder
/shared-folder
/shared-folder/**

View File

@ -54,6 +54,7 @@ INPUT_FILES="fluidsim.o"
gcc $COMPILE_FLAGS $INPUT_FILES -o $OUTPUT_FILE
#move to resources
mkdir -p ../../../shared-folder
mv "./libfluidsim$LIB_ENDING" "../../../shared-folder/"
#clean compile dir

View File

@ -6,7 +6,7 @@
#define SWAP(x0,x) {float *tmp=x0;x0=x;x=tmp;}
#define IX(i,j,k) ((i)+(N)*(j)+(N*N)*(k))
#define LINEARSOLVERTIMES 10
#define LINEARSOLVERTIMES 5
@ -247,47 +247,24 @@ void lin_solve(int N, int b, float* x, float* x0, float a, float c){
// update for each cell
for(k=1; k<N-1; k++){
for(j=1; j<N-1; j++){
for(int i = 1; i < N-1; i++){
x[IX(i,j,k)] = (x0[IX(i,j,k)] + a*(x[IX(i-1,j,k)]+x[IX(i+1,j,k)]+x[IX(i,j-1,k)]+x[IX(i,j+1,k)]+x[IX(i,j,k-1)]+x[IX(i,j,k+1)]))/c;
int n = 0;
for(i = 1; i < N-1; i=i+8){
__m256 vector = _mm256_loadu_ps(&x[IX(i-1,j,k)]);
vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i+1,j,k)]));
vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i,j-1,k)]));
vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i,j+1,k)]));
vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i,j,k-1)]));
vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i,j,k+1)]));
vector = _mm256_mul_ps(vector,aScalar);
vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x0[IX(i,j,k)]));
vector = _mm256_div_ps(vector,cScalar);
_mm256_storeu_ps(&x[IX(i,j,k)],vector);
}
if(i>N-1){
for(i=i-8; i < N-1; i++){
x[IX(i,j,k)] = (x0[IX(i,j,k)] + a*(x[IX(i-1,j,k)]+x[IX(i+1,j,k)]+x[IX(i,j-1,k)]+x[IX(i,j+1,k)]+x[IX(i,j,k-1)]+x[IX(i,j,k+1)]))/c;
}
}
//solve first item on row
// i=1;
// x[IX(i,j,k)] = (x0[IX(i,j,k)] + a*(x[IX(i-1,j,k)]+x[IX(i+1,j,k)]+x[IX(i,j-1,k)]+x[IX(i,j+1,k)]+x[IX(i,j,k-1)]+x[IX(i,j,k+1)]))/c;
// // for(int n = 2; n<N-2;n++){
// // x[IX(i,j,n)] = (a*(x[IX(i-1,j,n)]+x[IX(i+1,j,n)]+x[IX(i,j-1,n)]+x[IX(i,j+1,n)]+x[IX(i,j,n-1)]+x[IX(i,j,n+1)]));
// // }
// // if(x[IX(i,j,2)] > 0.0f){
// // printf("%f, %f\n",x[IX(i,j,2)],x0[IX(i,j,2)]);
// // }
// //vectorize solve as many as can be packed into vector ops
// //because we're operating on the innermost loop, guaranteed that k is align memory wise for all offsets
// for(i=2; i+8<N-2;i=i+8){
// // __m256 vector = _mm256_loadu_ps(&x[IX(i,j,k)]);
// // __m256 vector = _mm256_loadu_ps(&x[IX(i-1,j,k)]);
// // vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i+1,j,k)]));
// // vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i,j-1,k)]));
// // vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i,j+1,k)]));
// // vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i,j,k-1)]));
// // vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x[IX(i,j,k+1)]));
// // vector = _mm256_mul_ps(vector,aScalar);
// // vector = _mm256_add_ps(vector,_mm256_loadu_ps(&x0[IX(i,j,k)]));
// // vector = _mm256_div_ps(vector,cScalar);
// // _mm256_storeu_ps(&x[IX(i,j,k)],vector);
// // x[IX(i,j,k)] = (x0[IX(i,j,k)] + a*(x[IX(i-1,j,k)]+x[IX(i+1,j,k)]+x[IX(i,j-1,k)]+x[IX(i,j+1,k)]+x[IX(i,j,k-1)]+x[IX(i,j,k+1)]))/c;
// }
// // if(x[IX(i,j,2)] > 0.0f){
// // printf("%f, %f\n",x[IX(i,j,2)],x0[IX(i,j,2)]);
// // }
// for(int n = 2; n<i; n++){
// // x[IX(i,j,n)] = (
// // (x[IX(i,j,n)] + x[IX(i,j,n-1)] + x[IX(i,j,n+1)])
// // * a + x0[IX(i,j,n)])/ c;
// x[IX(i,j,n)] = (x0[IX(i,j,n)] + a*(x[IX(i-1,j,n)]+x[IX(i+1,j,n)]+x[IX(i,j-1,n)]+x[IX(i,j+1,n)]+x[IX(i,j,n-1)]+x[IX(i,j,n+1)]))/c;
// }
// for(; i<N-1; i++){
// x[IX(i,j,k)] = (x0[IX(i,j,k)] + a*(x[IX(i-1,j,k)]+x[IX(i+1,j,k)]+x[IX(i,j-1,k)]+x[IX(i,j+1,k)]+x[IX(i,j,k-1)]+x[IX(i,j,k+1)]))/c;
// }
}
}
set_bnd(N, b, x);