fluid-sim/src/main/c/velocitystep.c

#include <jni.h>
#include <stdio.h>
#include <immintrin.h>
#include <stdint.h>

#include "includes/utilities.h"
#include "includes/chunkmask.h"


#define BOUND_NO_DIR 0
#define BOUND_DIR_U 1
#define BOUND_DIR_V 2
#define BOUND_DIR_W 3

#define SET_BOUND_IGNORE 0
#define SET_BOUND_USE_NEIGHBOR 1

void add_source(int N, float * x, float * s, float dt);
void advect(JNIEnv * env, uint32_t chunk_mask, int N, int b, jobjectArray jrd, jobjectArray jrd0, float * u, float * v, float * w, float dt);


/*
 * Adds force to all vectors
 */
JNIEXPORT void JNICALL Java_electrosphere_FluidSim_addSourceToVectors
  (JNIEnv * env,
  jobject this,
  jint N,
  jint chunk_mask,
  jobjectArray jru,
  jobjectArray jrv,
  jobjectArray jrw,
  jobjectArray jru0,
  jobjectArray jrv0,
  jobjectArray jrw0,
  jfloat DIFFUSION_CONST,
  jfloat VISCOSITY_CONST,
  jfloat dt){
    add_source(N,GET_ARR(env,jru,CENTER_LOC),GET_ARR(env,jru0,CENTER_LOC),dt);
    add_source(N,GET_ARR(env,jrv,CENTER_LOC),GET_ARR(env,jrv0,CENTER_LOC),dt);
    add_source(N,GET_ARR(env,jrw,CENTER_LOC),GET_ARR(env,jrw0,CENTER_LOC),dt);
}

void add_source(int N, float * x, float * s, float dt){
	int i;
    int size=N*N*N;
	for(i=0; i<size; i++){
        x[i] += dt*s[i];
    }
}

/*
 * Solves vector diffusion along all axis
 */
JNIEXPORT void JNICALL Java_electrosphere_FluidSim_solveVectorDiffuse
  (JNIEnv * env,
  jobject this,
  jint N,
  jint chunk_mask,
  jobjectArray jru,
  jobjectArray jrv,
  jobjectArray jrw,
  jobjectArray jru0,
  jobjectArray jrv0,
  jobjectArray jrw0,
  jfloat DIFFUSION_CONST,
  jfloat VISCOSITY_CONST,
  jfloat dt){
    float a=dt*VISCOSITY_CONST*N*N*N;
    float c=1+6*a;
    int i, j, k, l, m;
    float * u = GET_ARR(env,jru,CENTER_LOC);
    float * v = GET_ARR(env,jrv,CENTER_LOC);
    float * w = GET_ARR(env,jrw,CENTER_LOC);
    float * u0 = GET_ARR(env,jru0,CENTER_LOC);
    float * v0 = GET_ARR(env,jrv0,CENTER_LOC);
    float * w0 = GET_ARR(env,jrw0,CENTER_LOC);

    __m256 aScalar = _mm256_set1_ps(a);
    __m256 cScalar = _mm256_set1_ps(c);

    //transform u direction
    for(k=1; k<N-1; k++){
        for(j=1; j<N-1; j++){
            int n = 0;
            //solve as much as possible vectorized
            for(i = 1; i < N-1; i=i+8){
                __m256 vector = _mm256_loadu_ps(&u[IX(i-1,j,k)]);
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&u[IX(i+1,j,k)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&u[IX(i,j-1,k)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&u[IX(i,j+1,k)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&u[IX(i,j,k-1)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&u[IX(i,j,k+1)]));
                vector = _mm256_mul_ps(vector,aScalar);
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&u0[IX(i,j,k)]));
                vector = _mm256_div_ps(vector,cScalar);
                _mm256_storeu_ps(&u[IX(i,j,k)],vector);
            }
            //If there is any leftover, perform manual solving
            if(i>N-1){
                for(i=i-8; i < N-1; i++){
                    u[IX(i,j,k)] = (u0[IX(i,j,k)] + a*(u[IX(i-1,j,k)]+u[IX(i+1,j,k)]+u[IX(i,j-1,k)]+u[IX(i,j+1,k)]+u[IX(i,j,k-1)]+u[IX(i,j,k+1)]))/c;
                }
            }
        }
    }

    //transform v direction
    for(k=1; k<N-1; k++){
        for(j=1; j<N-1; j++){
            int n = 0;
            //solve as much as possible vectorized
            for(i = 1; i < N-1; i=i+8){
                __m256 vector = _mm256_loadu_ps(&v[IX(i-1,j,k)]);
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&v[IX(i+1,j,k)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&v[IX(i,j-1,k)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&v[IX(i,j+1,k)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&v[IX(i,j,k-1)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&v[IX(i,j,k+1)]));
                vector = _mm256_mul_ps(vector,aScalar);
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&v0[IX(i,j,k)]));
                vector = _mm256_div_ps(vector,cScalar);
                _mm256_storeu_ps(&v[IX(i,j,k)],vector);
            }
            //If there is any leftover, perform manual solving
            if(i>N-1){
                for(i=i-8; i < N-1; i++){
                    v[IX(i,j,k)] = (v0[IX(i,j,k)] + a*(v[IX(i-1,j,k)]+v[IX(i+1,j,k)]+v[IX(i,j-1,k)]+v[IX(i,j+1,k)]+v[IX(i,j,k-1)]+v[IX(i,j,k+1)]))/c;
                }
            }
        }
    }

    //transform w direction
    for(k=1; k<N-1; k++){
        for(j=1; j<N-1; j++){
            int n = 0;
            //solve as much as possible vectorized
            for(i = 1; i < N-1; i=i+8){
                __m256 vector = _mm256_loadu_ps(&w[IX(i-1,j,k)]);
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&w[IX(i+1,j,k)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&w[IX(i,j-1,k)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&w[IX(i,j+1,k)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&w[IX(i,j,k-1)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&w[IX(i,j,k+1)]));
                vector = _mm256_mul_ps(vector,aScalar);
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&w0[IX(i,j,k)]));
                vector = _mm256_div_ps(vector,cScalar);
                _mm256_storeu_ps(&w[IX(i,j,k)],vector);
            }
            //If there is any leftover, perform manual solving
            if(i>N-1){
                for(i=i-8; i < N-1; i++){
                    w[IX(i,j,k)] = (w0[IX(i,j,k)] + a*(w[IX(i-1,j,k)]+w[IX(i+1,j,k)]+w[IX(i,j-1,k)]+w[IX(i,j+1,k)]+w[IX(i,j,k-1)]+w[IX(i,j,k+1)]))/c;
                }
            }
        }
    }
}

/*
 * Sets up a projection system of equations
 */
JNIEXPORT void JNICALL Java_electrosphere_FluidSim_setupProjection
  (JNIEnv * env,
  jobject this,
  jint N,
  jint chunk_mask,
  jobjectArray jru,
  jobjectArray jrv,
  jobjectArray jrw,
  jobjectArray jru0,
  jobjectArray jrv0,
  jobjectArray jrw0,
  jfloat DIFFUSION_CONST,
  jfloat VISCOSITY_CONST,
  jfloat dt){
    int i, j, k;

    __m256 xVector = _mm256_set1_ps(N);
    __m256 yVector = _mm256_set1_ps(N);
    __m256 zVector = _mm256_set1_ps(N);
    __m256 constScalar = _mm256_set1_ps(-1.0/3.0);
    __m256 zeroVec = _mm256_set1_ps(0);
    __m256 vector, vector2, vector3;

    float * u = GET_ARR(env,jru,CENTER_LOC);
    float * v = GET_ARR(env,jrv,CENTER_LOC);
    float * w = GET_ARR(env,jrw,CENTER_LOC);

    float * p = GET_ARR(env,jru0,CENTER_LOC);
    float * div = GET_ARR(env,jrv0,CENTER_LOC);

    float scalar = 1.0/3.0;
    float h = 1.0/N;

    for(k=1; k<N-1; k++){
        for(j=1; j<N-1; j++){
            i = 1;
            //
            //lower
            //
            //first part
            vector = _mm256_loadu_ps(&u[IX(i+1,j,k)]);
            vector = _mm256_sub_ps(vector,_mm256_loadu_ps(&u[IX(i-1,j,k)]));
            vector = _mm256_div_ps(vector,xVector);
            //second part
            vector2 = _mm256_loadu_ps(&v[IX(i,j+1,k)]);
            vector2 = _mm256_sub_ps(vector2,_mm256_loadu_ps(&v[IX(i,j-1,k)]));
            vector2 = _mm256_div_ps(vector2,yVector);
            //third part
            vector3 = _mm256_loadu_ps(&w[IX(i,j,k+1)]);
            vector3 = _mm256_sub_ps(vector3,_mm256_loadu_ps(&w[IX(i,j,k-1)]));
            vector3 = _mm256_div_ps(vector3,zVector);
            //multiply and finalize
            vector = _mm256_add_ps(vector,_mm256_add_ps(vector2,vector3));
            vector = _mm256_mul_ps(vector,constScalar);
            //store
            _mm256_storeu_ps(&div[IX(i,j,k)],vector);
            _mm256_storeu_ps(&p[IX(i,j,k)],zeroVec);
            i = 9;
            //
            //upper
            //
            //first part
            vector = _mm256_loadu_ps(&u[IX(i+1,j,k)]);
            vector = _mm256_sub_ps(vector,_mm256_loadu_ps(&u[IX(i-1,j,k)]));
            vector = _mm256_div_ps(vector,xVector);
            //second part
            vector2 = _mm256_loadu_ps(&v[IX(i,j+1,k)]);
            vector2 = _mm256_sub_ps(vector2,_mm256_loadu_ps(&v[IX(i,j-1,k)]));
            vector2 = _mm256_div_ps(vector2,yVector);
            //third part
            vector3 = _mm256_loadu_ps(&w[IX(i,j,k+1)]);
            vector3 = _mm256_sub_ps(vector3,_mm256_loadu_ps(&w[IX(i,j,k-1)]));
            vector3 = _mm256_div_ps(vector3,zVector);
            //multiply and finalize
            vector = _mm256_add_ps(vector,_mm256_add_ps(vector2,vector3));
            vector = _mm256_mul_ps(vector,constScalar);
            //store
            _mm256_storeu_ps(&div[IX(i,j,k)],vector);
            _mm256_storeu_ps(&p[IX(i,j,k)],zeroVec);


            // for(i = 1; i < N - 1; i++){
            //     div[IX(i,j,k)] =
            //     -scalar*h*(u[IX(i+1,j,k)]-u[IX(i-1,j,k)]+
            //     v[IX(i,j+1,k)]-v[IX(i,j-1,k)]+
            //     w[IX(i,j,k+1)]-w[IX(i,j,k-1)]);
            //     p[IX(i,j,k)] = 0;
            // }
        }
    }
}

/*
 * Solves a projection system of equations
 */
JNIEXPORT void JNICALL Java_electrosphere_FluidSim_solveProjection
  (JNIEnv * env,
  jobject this,
  jint N,
  jint chunk_mask,
  jobjectArray jru,
  jobjectArray jrv,
  jobjectArray jrw,
  jobjectArray jru0,
  jobjectArray jrv0,
  jobjectArray jrw0,
  jfloat DIFFUSION_CONST,
  jfloat VISCOSITY_CONST,
  jfloat dt){
    int a = 1;
    int c = 6;
    int i, j, k, l, m;
    __m256 aScalar = _mm256_set1_ps(a);
    __m256 cScalar = _mm256_set1_ps(c);

    float * p = GET_ARR(env,jru0,CENTER_LOC);
    float * div = GET_ARR(env,jrv0,CENTER_LOC);
    // update for each cell
    for(k=1; k<N-1; k++){
        for(j=1; j<N-1; j++){
            int n = 0;
            //solve as much as possible vectorized
            for(i = 1; i < N-1; i=i+8){
                __m256 vector = _mm256_loadu_ps(&p[IX(i-1,j,k)]);
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&p[IX(i+1,j,k)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&p[IX(i,j-1,k)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&p[IX(i,j+1,k)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&p[IX(i,j,k-1)]));
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&p[IX(i,j,k+1)]));
                // vector = _mm256_mul_ps(vector,aScalar);
                vector = _mm256_add_ps(vector,_mm256_loadu_ps(&div[IX(i,j,k)]));
                vector = _mm256_div_ps(vector,cScalar);
                _mm256_storeu_ps(&p[IX(i,j,k)],vector);
            }
            //If there is any leftover, perform manual solving
            if(i>N-1){
                for(i=i-8; i < N-1; i++){
                    p[IX(i,j,k)] = (div[IX(i,j,k)] + a*(p[IX(i-1,j,k)]+p[IX(i+1,j,k)]+p[IX(i,j-1,k)]+p[IX(i,j+1,k)]+p[IX(i,j,k-1)]+p[IX(i,j,k+1)]))/c;
                }
            }
            // for(i=1; i < N-1; i++){
            //     p[IX(i,j,k)] = (div[IX(i,j,k)] + a*(p[IX(i-1,j,k)]+p[IX(i+1,j,k)]+p[IX(i,j-1,k)]+p[IX(i,j+1,k)]+p[IX(i,j,k-1)]+p[IX(i,j,k+1)]))/c;
            // }
        }
    }
}

/*
 * Finalizes a projection (subtract curl, set bounds, etc)
 */
JNIEXPORT void JNICALL Java_electrosphere_FluidSim_finalizeProjection
  (JNIEnv * env,
  jobject this,
  jint N,
  jint chunk_mask,
  jobjectArray jru,
  jobjectArray jrv,
  jobjectArray jrw,
  jobjectArray jru0,
  jobjectArray jrv0,
  jobjectArray jrw0,
  jfloat DIFFUSION_CONST,
  jfloat VISCOSITY_CONST,
  jfloat dt){
    int i, j, k;
    // __m256 constScalar = _mm256_set1_ps(0.5f*N);
    __m256 xScalar = _mm256_set1_ps(0.5*N);
    __m256 yScalar = _mm256_set1_ps(0.5*N);
    __m256 zScalar = _mm256_set1_ps(0.5*N);
    __m256 vector, vector2, vector3;

    float * u = GET_ARR(env,jru,CENTER_LOC);
    float * v = GET_ARR(env,jrv,CENTER_LOC);
    float * w = GET_ARR(env,jrw,CENTER_LOC);

    float * p = GET_ARR(env,jru0,CENTER_LOC);
    float * div = GET_ARR(env,jrv0,CENTER_LOC);

    float h = 1.0 / N;

    for ( k=1 ; k<N-1 ; k++ ) {
        for ( j=1 ; j<N-1 ; j++ ) {
            //
            //v
            //
            //lower
            vector =  _mm256_loadu_ps(&p[IX(1+1,j,k)]);
            vector2 = _mm256_loadu_ps(&p[IX(1-1,j,k)]);
            vector =  _mm256_sub_ps(vector,vector2);
            vector = _mm256_mul_ps(vector,xScalar);
            vector = _mm256_sub_ps(_mm256_loadu_ps(&u[IX(1,j,k)]),vector);
            _mm256_storeu_ps(&u[IX(1,j,k)],vector);
            //upper
            vector =  _mm256_loadu_ps(&p[IX(9+1,j,k)]);
            vector2 = _mm256_loadu_ps(&p[IX(9-1,j,k)]);
            vector =  _mm256_sub_ps(vector,vector2);
            vector = _mm256_mul_ps(vector,xScalar);
            vector = _mm256_sub_ps(_mm256_loadu_ps(&u[IX(9,j,k)]),vector);
            _mm256_storeu_ps(&u[IX(9,j,k)],vector);
            //
            //v
            //
            //lower
            vector =  _mm256_loadu_ps(&p[IX(1,j+1,k)]);
            vector2 = _mm256_loadu_ps(&p[IX(1,j-1,k)]);
            vector =  _mm256_sub_ps(vector,vector2);
            vector = _mm256_mul_ps(vector,yScalar);
            vector = _mm256_sub_ps(_mm256_loadu_ps(&v[IX(1,j,k)]),vector);
            _mm256_storeu_ps(&v[IX(1,j,k)],vector);
            //upper
            vector =  _mm256_loadu_ps(&p[IX(9,j+1,k)]);
            vector2 = _mm256_loadu_ps(&p[IX(9,j-1,k)]);
            vector =  _mm256_sub_ps(vector,vector2);
            vector = _mm256_mul_ps(vector,yScalar);
            vector = _mm256_sub_ps(_mm256_loadu_ps(&v[IX(9,j,k)]),vector);
            _mm256_storeu_ps(&v[IX(9,j,k)],vector);
            //
            //w
            //
            //lower
            vector =  _mm256_loadu_ps(&p[IX(1,j,k+1)]);
            vector2 = _mm256_loadu_ps(&p[IX(1,j,k-1)]);
            vector =  _mm256_sub_ps(vector,vector2);
            vector = _mm256_mul_ps(vector,zScalar);
            vector = _mm256_sub_ps(_mm256_loadu_ps(&w[IX(1,j,k)]),vector);
            _mm256_storeu_ps(&w[IX(1,j,k)],vector);
            //upper
            vector =  _mm256_loadu_ps(&p[IX(9,j,k+1)]);
            vector2 = _mm256_loadu_ps(&p[IX(9,j,k-1)]);
            vector =  _mm256_sub_ps(vector,vector2);
            vector = _mm256_mul_ps(vector,zScalar);
            vector = _mm256_sub_ps(_mm256_loadu_ps(&w[IX(9,j,k)]),vector);
            _mm256_storeu_ps(&w[IX(9,j,k)],vector);
            // for(i = 1; i < N-1; i++){
            //     u[IX(i,j,k)] = u[IX(i,j,k)] - 0.5 * (p[IX(i+1,j,k)] - p[IX(i-1,j,k)]) / h;
            //     v[IX(i,j,k)] = v[IX(i,j,k)] - 0.5 * (p[IX(i,j+1,k)] - p[IX(i,j-1,k)]) / h;
            //     w[IX(i,j,k)] = w[IX(i,j,k)] - 0.5 * (p[IX(i,j,k+1)] - p[IX(i,j,k-1)]) / h;
            // }
        }
    }
}

/*
 * Advects u, v, and w
 */
JNIEXPORT void JNICALL Java_electrosphere_FluidSim_advectVectors
(JNIEnv * env,
  jobject this,
  jint N,
  jint chunk_mask,
  jobjectArray jru,
  jobjectArray jrv,
  jobjectArray jrw,
  jobjectArray jru0,
  jobjectArray jrv0,
  jobjectArray jrw0,
  jfloat DIFFUSION_CONST,
  jfloat VISCOSITY_CONST,
  jfloat dt){
    advect(env,chunk_mask,N,1,jru,jru0,GET_ARR(env,jru0,CENTER_LOC),GET_ARR(env,jrv0,CENTER_LOC),GET_ARR(env,jrw0,CENTER_LOC),dt);
    advect(env,chunk_mask,N,2,jrv,jrv0,GET_ARR(env,jru0,CENTER_LOC),GET_ARR(env,jrv0,CENTER_LOC),GET_ARR(env,jrw0,CENTER_LOC),dt);
    advect(env,chunk_mask,N,3,jrw,jrw0,GET_ARR(env,jru0,CENTER_LOC),GET_ARR(env,jrv0,CENTER_LOC),GET_ARR(env,jrw0,CENTER_LOC),dt);
}


void advect(JNIEnv * env, uint32_t chunk_mask, int N, int b, jobjectArray jrd, jobjectArray jrd0, float * u, float * v, float * w, float dt){
    int i, j, k, i0, j0, k0, i1, j1, k1;
    int m,n,o;
    float x, y, z, s0, t0, s1, t1, u1, u0, dtx,dty,dtz;

    dtx=dty=dtz=dt*N;

    float * d = GET_ARR(env,jrd,CENTER_LOC);

    float * d0 = GET_ARR(env,jrd0,CENTER_LOC);

    for(k=1; k<N-1; k++){
        for(j=1; j<N-1; j++){
            for(i=1; i<N-1; i++){
                d0 = GET_ARR(env,jrd0,CENTER_LOC);
                //calculate location to pull from
                x = i-dtx*u[IX(i,j,k)];
                y = j-dty*v[IX(i,j,k)];
                z = k-dtz*w[IX(i,j,k)];

                m = n = o = 1;

                if(x < 0){ m += 1; }
                else if(x >= N){ m -= 1; }
                if(y < 0){ n += 1; }
                else if(y >= N){ n -= 1; }
                if(z < 0){ o += 1; }
                else if(z >= N){ o -= 1; }

                //If the out of bounds coordinate is in bounds for a neighbor chunk, use that chunk as source instead
                if(CK(m,n,o) != CENTER_LOC && ARR_EXISTS(chunk_mask,m,n,o)){

                    // if(i == 1 && j == 1 && k == 1){
                    //     printf("\narr indices: %d %d %d\n\n",m,n,o);
                    // }

                    //cases:
                    //if x = 17.01, m = 2
                    // 17 in current array is 1 in neighbor
                    // 18 in current array is 2 in neighbor
                    // 19 in current array is 3 in neighbor
                    //want to sample neighbor array at 1 & 2
                    //x becomes 1.01, sampling new array (keep in mind that 0 in the new array should contain the current array values)
                    //modification: subtract 16

                    //cases:
                    //if x = 16.99, m = 2
                    // 16 in current array is 0 in neighbor
                    // 17 in current array is 1 in neighbor
                    // 18 in current array is 2 in neighbor
                    // 19 in current array is 3 in neighbor
                    //want to sample current array still
                    //x becomes 1.01, sampling new array (keep in mind that 0 in the new array should contain the current array values)
                    //modification: no modification

                    //if x = 0.01, m = 0
                    // 0 in current array is 16 in neighbor
                    //-1 in current array is 15 in neighbor
                    //-2 in current array is 14 in neighbor
                    //want to sample current array still
                    //x becomes 15.01, sampling new array (keep in mind that 17 in the new array should contain the current array)
                    //modification: no modification

                    //if x = -0.01, m = 0
                    // 0 in current array is 16 in neighbor
                    //-1 in current array is 15 in neighbor
                    //-2 in current array is 14 in neighbor
                    //want to sample -1 & 0, so i0 becomes 15
                    //x becomes 15.99, sampling new array (keep in mind that 17 in the new array should contain the current array)
                    //modification: add 16

                    //if x = -2, m = 0
                    // 0 in current array is 16 in neighbor
                    //-1 in current array is 15 in neighbor
                    //-2 in current array is 14 in neighbor
                    //x becomes 14, sampling new array (keep in mind that 17 in the new array should contain the current array)
                    //modification: add 16


                    // printf("Hit other chunk\n");
                    d0 = GET_ARR(env,jrd0,CK(m,n,o));
                    x = x + CHUNK_NORMALIZE_U[CK(m,n,o)] * (N-2);
                    // printf("%d => %f\n",m,x);
                    y = y + CHUNK_NORMALIZE_V[CK(m,n,o)] * (N-2);
                    z = z + CHUNK_NORMALIZE_W[CK(m,n,o)] * (N-2);
                }

                //clamp location within chunk
                //get indices, and calculate percentage to pull from each index
                if(x < 0.001f){
                    //cases to consider:
                    //m = 0, x = -10
                    //m = 2, x = 0.01
                    x=0.001f;
                    i0=(int)0;
                    i1=1;
                    s0 = 0.999f;
                    s1 = 0.001f;
                } else if(x > N - 1){
                    //cases to consider:
                    //m = 0, x = 17.01
                    //m = 2, x = 20
                    x = N-1;
                    i0=(int)N-2;
                    i1=N-1;
                    s0 = 0.001f;
                    s1 = 0.999f;
                } else {
                    i0=(int)x;
                    i1=i0+1;
                    s1 = x-i0;
                    s0 = 1-s1;
                }

                if(y < 0.001f){
                    //cases to consider:
                    //m = 0, x = -10
                    //m = 2, x = 0.01
                    y=0.001f;
                    j0=(int)0;
                    j1=1;
                    t0 = 0.999f;
                    t1 = 0.001f;
                } else if(y > N - 1){
                    //cases to consider:
                    //m = 0, x = 17.01
                    //m = 2, x = 20
                    y = N-1;
                    j0=(int)N-2;
                    j1=N-1;
                    t0 = 0.001f;
                    t1 = 0.999f;
                } else {
                    j0=(int)y;
                    j1=j0+1;
                    t1 = y-j0;
                    t0 = 1-t1;
                }


                if(z < 0.001f){
                    //cases to consider:
                    //m = 0, x = -10
                    //m = 2, x = 0.01
                    z=0.001f;
                    k0=(int)0;
                    k1=1;
                    u0 = 0.999f;
                    u1 = 0.001f;
                } else if(z > N - 1){
                    //cases to consider:
                    //m = 0, x = 17.01
                    //m = 2, x = 20
                    z = N-1;
                    k0=(int)N-2;
                    k1=N-1;
                    u0 = 0.001f;
                    u1 = 0.999f;
                } else {
                    k0=(int)z;
                    k1=k0+1;
                    u1 = z-k0;
                    u0 = 1-u1;
                }

                // if (x<0.001f) x=0.001f;
                // if (x>N+0.5f) x=N+0.5f;
                // if (y<0.001f) y=0.001f;
                // if (y>N+0.5f) y=N+0.5f;
                // if (z<0.001f) z=0.001f;
                // if (z>N+0.5f) z=N+0.5f;

                //get actual indices
                // i0=(int)x;
                // i1=i0+1;
                // j0=(int)y;
                // j1=j0+1;
                // k0=(int)z;
                // k1=k0+1;

                //calculate percentage of each index
                // s1 = x-i0;
                // s0 = 1-s1;
                // t1 = y-j0;
                // t0 = 1-t1;
                // u1 = z-k0;
                // u0 = 1-u1;

                // if(i0 >= N){
                //     i0 = N - 1;
                // }
                // if(i0 < 0){
                //     i0 = 0;
                // }
                if(j0 >= N){
                    j0 = N - 1;
                }
                // if(j0 < 0){
                //     j0 = 0;
                // }
                if(k0 >= N){
                    k0 = N - 1;
                }
                // if(k0 < 0){
                //     k0 = 0;
                // }
                // if(i1 >= N){
                //     i1 = N - 1;
                // }
                // if(i1 < 0){
                //     i1 = 0;
                // }
                if(j1 >= N){
                    j1 = N - 1;
                }
                // if(j1 < 0){
                //     j1 = 0;
                // }
                if(k1 >= N){
                    k1 = N - 1;
                }
                // if(k1 < 0){
                //     k1 = 0;
                // }
                d[IX(i,j,k)] =
                s0*(
                    t0*u0*d0[IX(i0,j0,k0)]+
                    t1*u0*d0[IX(i0,j1,k0)]+
                    t0*u1*d0[IX(i0,j0,k1)]+
                    t1*u1*d0[IX(i0,j1,k1)]
                )+
                s1*(
                    t0*u0*d0[IX(i1,j0,k0)]+
                    t1*u0*d0[IX(i1,j1,k0)]+
                    t0*u1*d0[IX(i1,j0,k1)]+
                    t1*u1*d0[IX(i1,j1,k1)]
                );
            }
        }
    }
}

JNIEXPORT void JNICALL Java_electrosphere_FluidSim_setBoundsToNeighbors
  (JNIEnv * env,
  jobject this,
  jint N,
  jint chunk_mask,
  jint vector_dir,
  jobjectArray neighborArray){
    int DIM = N;
    float * target = GET_ARR(env,neighborArray,CENTER_LOC);
    float * source;
    for(int x=1; x < DIM-1; x++){
        for(int y = 1; y < DIM-1; y++){
            target[IX(0,x,y)] = vector_dir==BOUND_DIR_U ? -target[IX(1,x,y)] : target[IX(1,x,y)];
        }
    }
    for(int x=1; x < DIM-1; x++){
        for(int y = 1; y < DIM-1; y++){
            target[IX(DIM-1,x,y)] =     vector_dir==BOUND_DIR_U ? -target[IX(DIM-2,x,y)] : target[IX(DIM-2,x,y)];
        }
    }
    for(int x=1; x < DIM-1; x++){
        for(int y = 1; y < DIM-1; y++){
            //((x)+(DIM)*(y) + (DIM)*(DIM)*(z))
            target[IX(x,0,y)] =         vector_dir==BOUND_DIR_V ? -target[IX(x,1,y)] : target[IX(x,1,y)];
            target[IX(x,DIM-1,y)] =     vector_dir==BOUND_DIR_V ? -target[IX(x,DIM-2,y)] : target[IX(x,DIM-2,y)];
            target[IX(x,y,0)] =         vector_dir==BOUND_DIR_W ? -target[IX(x,y,1)] : target[IX(x,y,1)];
            target[IX(x,y,DIM-1)] =     vector_dir==BOUND_DIR_W ? -target[IX(x,y,DIM-2)] : target[IX(x,y,DIM-2)];
        }
    }
    for(int x = 1; x < DIM-1; x++){
        target[IX(x,0,0)]         = (float)(0.5f * (target[IX(x,1,0)] + target[IX(x,0,1)]));
        target[IX(x,DIM-1,0)]     = (float)(0.5f * (target[IX(x,DIM-2,0)] + target[IX(x,DIM-1,1)]));
        target[IX(x,0,DIM-1)]     = (float)(0.5f * (target[IX(x,1,DIM-1)] + target[IX(x,0,DIM-2)]));
        target[IX(x,DIM-1,DIM-1)] = (float)(0.5f * (target[IX(x,DIM-2,DIM-1)] + target[IX(x,DIM-1,DIM-2)]));

        target[IX(0,x,0)] = (float)(0.5f * (target[IX(1,x,0)] + target[IX(0,x,1)]));
        target[IX(DIM-1,x,0)] = (float)(0.5f * (target[IX(DIM-2,x,0)] + target[IX(DIM-1,x,1)]));
        target[IX(0,x,DIM-1)] = (float)(0.5f * (target[IX(1,x,DIM-1)] + target[IX(0,x,DIM-2)]));
        target[IX(DIM-1,x,DIM-1)] = (float)(0.5f * (target[IX(DIM-2,x,DIM-1)] + target[IX(DIM-1,x,DIM-2)]));


        target[IX(0,0,x)] = (float)(0.5f * (target[IX(1,0,x)] + target[IX(0,1,x)]));
        target[IX(DIM-1,0,x)] = (float)(0.5f * (target[IX(DIM-2,0,x)] + target[IX(DIM-1,1,x)]));
        target[IX(0,DIM-1,x)] = (float)(0.5f * (target[IX(1,DIM-1,x)] + target[IX(0,DIM-2,x)]));
        target[IX(DIM-1,DIM-1,x)] = (float)(0.5f * (target[IX(DIM-2,DIM-1,x)] + target[IX(DIM-1,DIM-2,x)]));

    }
    target[IX(0,0,0)]             = (float)((target[IX(1,0,0)]+target[IX(0,1,0)]+target[IX(0,0,1)])/3.0);
    target[IX(DIM-1,0,0)]         = (float)((target[IX(DIM-2,0,0)]+target[IX(DIM-1,1,0)]+target[IX(DIM-1,0,1)])/3.0);
    target[IX(0,DIM-1,0)]         = (float)((target[IX(1,DIM-1,0)]+target[IX(0,DIM-2,0)]+target[IX(0,DIM-1,1)])/3.0);
    target[IX(0,0,DIM-1)]         = (float)((target[IX(0,0,DIM-2)]+target[IX(1,0,DIM-1)]+target[IX(0,1,DIM-1)])/3.0);
    target[IX(DIM-1,DIM-1,0)]     = (float)((target[IX(DIM-2,DIM-1,0)]+target[IX(DIM-1,DIM-2,0)]+target[IX(DIM-1,DIM-1,1)])/3.0);
    target[IX(0,DIM-1,DIM-1)]     = (float)((target[IX(1,DIM-1,DIM-1)]+target[IX(0,DIM-2,DIM-1)]+target[IX(0,DIM-1,DIM-2)])/3.0);
    target[IX(DIM-1,0,DIM-1)]     = (float)((target[IX(DIM-1,0,DIM-2)]+target[IX(DIM-2,0,DIM-1)]+target[IX(DIM-1,1,DIM-1)])/3.0);
    target[IX(DIM-1,DIM-1,DIM-1)] = (float)((target[IX(DIM-1,DIM-1,DIM-2)]+target[IX(DIM-1,DIM-2,DIM-1)]+target[IX(DIM-1,DIM-1,DIM-2)])/3.0);
}

/**
 * This exclusively copies neighbors to make sure zeroing out stuff doesn't break sim
*/
JNIEXPORT void JNICALL Java_electrosphere_FluidSim_copyNeighbors
  (JNIEnv * env,
  jobject this,
  jint N,
  jint chunk_mask,
  jint cx,
  jint vector_dir,
  jobjectArray neighborArray){
    int DIM = N;
    float * target = GET_ARR(env,neighborArray,CENTER_LOC);
    float * source;


    //
    //
    //   PLANES
    //
    //
    if(ARR_EXISTS(chunk_mask,0,1,1)){
        source = GET_ARR(env,neighborArray,CK(0,1,1));
        for(int x=1; x < DIM-1; x++){
            for(int y = 1; y < DIM-1; y++){
                target[IX(0,x,y)] = source[IX(DIM-2,x,y)];
            }
        }
    }

    if(ARR_EXISTS(chunk_mask,2,1,1)){
        source = GET_ARR(env,neighborArray,CK(2,1,1));
        for(int x=1; x < DIM-1; x++){
            for(int y = 1; y < DIM-1; y++){
                target[IX(DIM-1,x,y)] = source[IX(1,x,y)];
            }
        }
    }

    if(ARR_EXISTS(chunk_mask,1,0,1)){
        source = GET_ARR(env,neighborArray,CK(1,0,1));
        for(int x=1; x < DIM-1; x++){
            for(int y = 1; y < DIM-1; y++){
                target[IX(x,0,y)] = source[IX(x,DIM-2,y)];
            }
        }
    }

    if(ARR_EXISTS(chunk_mask,1,2,1)){
        source = GET_ARR(env,neighborArray,CK(1,2,1));
        for(int x=1; x < DIM-1; x++){
            for(int y = 1; y < DIM-1; y++){
                target[IX(x,DIM-1,y)] = source[IX(x,1,y)];
            }
        }
    }

    if(ARR_EXISTS(chunk_mask,1,1,0)){
        source = GET_ARR(env,neighborArray,CK(1,1,0));
        for(int x=1; x < DIM-1; x++){
            for(int y = 1; y < DIM-1; y++){
                target[IX(x,y,0)] = source[IX(x,y,DIM-2)];
            }
        }
    }

    if(ARR_EXISTS(chunk_mask,1,1,2)){
        source = GET_ARR(env,neighborArray,CK(1,1,2));
        for(int x=1; x < DIM-1; x++){
            for(int y = 1; y < DIM-1; y++){
                target[IX(x,y,DIM-1)] = source[IX(x,y,1)];
            }
        }
    }


    //
    //
    //    EDGES
    //
    //
    if(ARR_EXISTS(chunk_mask,0,0,1)){
        source = GET_ARR(env,neighborArray,CK(0,0,1));
        for(int x=1; x < DIM-1; x++){
                target[IX(0,0,x)] = source[IX(DIM-2,DIM-2,x)];
        }
    }

    if(ARR_EXISTS(chunk_mask,2,0,1)){
        source = GET_ARR(env,neighborArray,CK(2,0,1));
        for(int x=1; x < DIM-1; x++){
                target[IX(DIM-1,0,x)] = source[IX(1,DIM-2,x)];
        }
    }

    if(ARR_EXISTS(chunk_mask,0,2,1)){
        source = GET_ARR(env,neighborArray,CK(0,2,1));
        for(int x=1; x < DIM-1; x++){
                target[IX(0,DIM-1,x)] = source[IX(DIM-2,1,x)];
        }
    }

    if(ARR_EXISTS(chunk_mask,2,2,1)){
        source = GET_ARR(env,neighborArray,CK(2,2,1));
        for(int x=1; x < DIM-1; x++){
                target[IX(DIM-1,DIM-1,x)] = source[IX(1,1,x)];
        }
    }

    //
    //

    if(ARR_EXISTS(chunk_mask,0,1,0)){
        source = GET_ARR(env,neighborArray,CK(0,1,0));
        for(int x=1; x < DIM-1; x++){
                target[IX(0,x,0)] = source[IX(DIM-2,x,DIM-2)];
        }
    }

    if(ARR_EXISTS(chunk_mask,2,1,0)){
        source = GET_ARR(env,neighborArray,CK(2,1,0));
        for(int x=1; x < DIM-1; x++){
                target[IX(DIM-1,x,0)] = source[IX(1,x,DIM-2)];
        }
    }

    if(ARR_EXISTS(chunk_mask,0,1,2)){
        source = GET_ARR(env,neighborArray,CK(0,1,2));
        for(int x=1; x < DIM-1; x++){
                target[IX(0,x,DIM-1)] = source[IX(DIM-2,x,1)];
        }
    }

    if(ARR_EXISTS(chunk_mask,2,1,2)){
        source = GET_ARR(env,neighborArray,CK(2,1,2));
        for(int x=1; x < DIM-1; x++){
                target[IX(DIM-1,x,DIM-1)] = source[IX(1,x,1)];
        }
    }

    //
    //

    if(ARR_EXISTS(chunk_mask,1,0,0)){
        source = GET_ARR(env,neighborArray,CK(1,0,0));
        for(int x=1; x < DIM-1; x++){
                target[IX(x,0,0)] = source[IX(x,DIM-2,DIM-2)];
        }
    }

    if(ARR_EXISTS(chunk_mask,1,2,0)){
        source = GET_ARR(env,neighborArray,CK(1,2,0));
        for(int x=1; x < DIM-1; x++){
                target[IX(x,DIM-1,0)] = source[IX(x,1,DIM-2)];
        }
    }

    if(ARR_EXISTS(chunk_mask,1,0,2)){
        source = GET_ARR(env,neighborArray,CK(1,0,2));
        for(int x=1; x < DIM-1; x++){
                target[IX(x,0,DIM-1)] = source[IX(x,DIM-2,1)];
        }
    }

    if(ARR_EXISTS(chunk_mask,1,2,2)){
        source = GET_ARR(env,neighborArray,CK(1,2,2));
        for(int x=1; x < DIM-1; x++){
                target[IX(x,DIM-1,DIM-1)] = source[IX(x,1,1)];
        }
    }


    //
    //
    //     CORNERS
    //
    //

    if(ARR_EXISTS(chunk_mask,0,0,0)){
        source = GET_ARR(env,neighborArray,CK(0,0,0));
        target[IX(0,0,0)] = source[IX(DIM-2,DIM-2,DIM-2)];
    }

    if(ARR_EXISTS(chunk_mask,2,0,0)){
        source = GET_ARR(env,neighborArray,CK(2,0,0));
        target[IX(DIM-1,0,0)] = source[IX(1,DIM-2,DIM-2)];
    }

    if(ARR_EXISTS(chunk_mask,0,2,0)){
        source = GET_ARR(env,neighborArray,CK(0,2,0));
        target[IX(0,DIM-1,0)] = source[IX(DIM-2,1,DIM-2)];
    }

    if(ARR_EXISTS(chunk_mask,2,2,0)){
        source = GET_ARR(env,neighborArray,CK(2,2,0));
        target[IX(DIM-1,DIM-1,0)] = source[IX(1,1,DIM-2)];
    }

    //
    //

    if(ARR_EXISTS(chunk_mask,0,0,2)){
        source = GET_ARR(env,neighborArray,CK(0,0,2));
        target[IX(0,0,DIM-1)] = source[IX(DIM-2,DIM-2,1)];
    }

    if(ARR_EXISTS(chunk_mask,2,0,2)){
        source = GET_ARR(env,neighborArray,CK(2,0,2));
        target[IX(DIM-1,0,DIM-1)] = source[IX(1,DIM-2,1)];
    }

    if(ARR_EXISTS(chunk_mask,0,2,2)){
        source = GET_ARR(env,neighborArray,CK(0,2,2));
        target[IX(0,DIM-1,DIM-1)] = source[IX(DIM-2,1,1)];
    }

    if(ARR_EXISTS(chunk_mask,2,2,2)){
        source = GET_ARR(env,neighborArray,CK(2,2,2));
        target[IX(DIM-1,DIM-1,DIM-1)] = source[IX(1,1,1)];
    }


}