package x10;
import java.util.concurrent.atomic.*;
public class PLU2C { //(int px, int py, int nx, int ny, int B) {
int px; // DZ
int py; // DZ
int nx; // DZ
int ny; // DZ
int B; // DZ
public static final int NUM_TESTS=10, LOOK_AHEAD=6, RAND_MAX=100;
final AtomicIntegerArray pivots;
final int bx,by,N;
/*final*/ boolean C_MODE;
final Blocks[] A;
public PLU2C(int px/*_*/, int py/*_*/, int nx/*_*/, int ny/*_*/, int B/*_*/, boolean C_MODE/*_*/) {
//property(px_,py_,nx_,ny_,B_);
bx=nx*px; by=ny*py; N=bx*B; //C_MODE=C_MODE_;
//assert bx==by;
pivots = new AtomicIntegerArray(N);
for(int i=0; i</*pivots.length()*/N; i++) pivots.set(i,i);
A = new Blocks[px*py];
for (int i=0; i < px*py; i++) A[i] = new Blocks(nx*ny);
for(int pi=0; pi<px; ++pi)
for(int pj=0; pj<py; ++pj)
for(int i=0; i<nx; ++i)
for(int j=0; j<ny; ++j)
A[pord(pi,pj)].z[lord(i,j)] = new Block(i*px+pi, j*py+pj);
}
public PLU2C(PLU2C o) {
//property(o.px,o.py,o.nx,o.ny,o.B);
bx=o.bx; by=o.by; N=o.N;C_MODE=o.C_MODE;
pivots = o.pivots;
A = new Blocks[px*py];
for (int i=0; i<px*py;++i) A[i] = new Blocks(nx*ny);
for(int pi=0; pi<px; ++pi)
for(int pj=0; pj<py; ++pj)
for(int i=0; i<nx; ++i)
for(int j=0; j<ny; ++j)
A[pord(pi,pj)].z[lord(i,j)] = new Block(o.getLocal(pi, pj, i, j));
}
//static double format(double v, int precision){
//int scale=1;
//for(int i=0; i<precision; i++) scale *= 10;
//return ((int)(v*scale))*1.0/scale;
//}
static int max(int a, int b) { return a>b?a:b; }
static double max(double a, double b) { return a>b?a:b;}
static double fabs(double v){ return v>0?v:-v; }
static int min(int a, int b) { return a>b?b:a; }
//static double flops(int n) { return ((4.0*n-3.0)*n-1.0)*n/6.0; }
boolean in(int x, int low, int high) { return low <= x && x < high;}
int pord(int i, int j) { return i*py+j; }
int lord(int i, int j) { return i*ny+j; }
Block getBlock(int i, int j) { return A[pord(i%px, j%py)].z[lord(i/px,j/py)]; }
Block getLocal(int pi, int pj, int ni, int nj) {
return A[pord(pi,pj)].z[lord(ni,nj)];
}
class Blocks {
final Block[] z;
Blocks(int n) { z=new Block[n]; }
}
//PLU2C applyPivots(boolean lowerOnly) {
//for(int i=0; i<bx; i++)
//for(int j=0; j< (lowerOnly? i : by); j++)
//for(int r=i*B; r<(i+1)*B; r++) {
//final int target = pivots.get(r);
//if(r != target) getBlock(i,j).permute(r, target);
//}
//return this;
//}
//boolean verify(PLU2C M) {
//double max_diff = 0.0;
//for (int i = 0; i < N; i++) {
//int iB = i%B;
//for (int j = 0; j < N; j++) {
//final int I=i/B, J=j/B, jB = j%B;
//double v = 0.0;
//int k;
//for (k=0; k<i && k <= j; k++) {
//final int K=k/B, kB=k%B;
//v += M.getBlock(I,K).get(iB, kB) * M.getBlock(K,J).get(kB, jB);
//}
//if (k==i && k <= j) {
//final int K=k/B, kB=k%B;
//v += M.getBlock(K,J).get(kB, jB);
//}
//double diff = fabs(getBlock(I,J).get(iB, jB) - v);
//max_diff = max(diff, max_diff);
//}
//}
//return (max_diff <= 0.01);
//}
public static void main(String[] a) {
//System.out.print("PLU2C ");
//if (a.length < 4) {
//System.out.println("Usage: PLU2C N b px py [C_Mode] [verify] [libraryFileName] ");
//return;
//}
final int N = a.length;//java.lang.Integer.parseInt(a[0]);
final int B= a.length;//java.lang.Integer.parseInt(a[1]);
final int px= a.length;//java.lang.Integer.parseInt(a[2]);
final int py= a.length;//java.lang.Integer.parseInt(a[3]);
//boolean VERIFY = true;
boolean C_MODE=false;
//String libraryName = "/vol/x10/vj/libPLUInC.so";
//if (a.length > 4) C_MODE=java.lang.Boolean.parseBoolean(a[4]);
//if (a.length > 5) VERIFY=java.lang.Boolean.parseBoolean(a[5]);
//if (a.length > 6) libraryName = a[6];
final int nx = N / (px*B), ny = N/(py*B);
//assert (N % (px*B) == 0 && N % (py*B) == 0);
//System.out.println("N="+N + " B=" + B + " px=" + px + " py=" + py + " nx=" + nx + " ny="+ny);
//if (C_MODE) System.load(libraryName);
final PLU2C orig = new PLU2C(px,py,nx,ny,B, C_MODE);
int i=0;
//double[] flops = new double[NUM_TESTS];
while (i<NUM_TESTS) {
PLU2C plu = new PLU2C(orig);
//System.gc(); System.out.print("Starting...");
//long s = - System.nanoTime();
run(plu,nx,ny,px,py); //DZ
//s += System.nanoTime();
//flops[i]=format(flops(N)/(s)*1000, 3);
//System.out.print(" Time="+(s)/1000000+"ms"+" Rate="+flops[i]+" MFLOPS");
//if (VERIFY) {
//System.out.print(" (Verifying...");
//long v = - System.nanoTime();
//boolean correct = new PLU2C(orig).applyPivots(false).verify(plu.applyPivots(true));
//v += System.nanoTime();
//System.out.print(((v)/1000000) + " ms " + (correct?" ok)":" fail)"));
//}
//System.out.println();
i++;
}
//double max = 0.0; for (i=0; i < NUM_TESTS; i++) if (max < flops[i]) max=flops[i];
//System.out.println("Max rate: " + max);
}
public static void run(PLU2C o, int nx, int ny, int px, int py) { //DZ
Conc.finish_begin();
run_1(o,nx,ny,px,py); //DZ
Conc.finish_end();
}
public static void run_1(PLU2C o, int nx, int ny, int px, int py) { //DZ
for (int pi=0; pi<px; pi++) {
for (int pj=0; pj<py; pj++) {
Conc.async_begin();
run_2(o,nx,ny,pi,pj); //DZ
Conc.async_end();
}
}
}
public static void run_2(PLU2C o, int nx, int ny, int pi, int pj) { //DZ
int startY=0, loopCount=0, startX[]=new int[ny];
final Block[] myBlocks=o.A[o.pord(pi,pj)].z;
while(startY < ny) {
//boolean done=false;
for (int j=startY; j < min(startY+LOOK_AHEAD, ny) /*&& !done*/; ++j) {
for (int i=startX[j]; i <nx; ++i) {
final Block b = myBlocks[o.lord(i,j)];
if (b.ready) {
if (i==startX[j]) startX[j]++;
} /*else done |= b.step(startY, startX)*/;
Thread.yield();
}
}
//if (startX[startY]==nx) { startY++;}
startY++;
loopCount++;
//if (loopCount % 100000 == 0) System.err.println("w("+pi+","+pj
//+ "startY=" + startY + "startX=" + startX + "):" + loopCount);
}
}
class Block {//(int/*(0,bx)*/ I, int/*(0,by)*/ J) {
int I; // DZ
int J; //DZ
final int maxCount;
final double/*[:zeroBased&&rank==1&&rect]*/ A[]=new double[2000]; // DZ
/*@shared*/ boolean ready = false;
/*@shared*/ private int count=0;
/*@shared*/ int LUCol=-1;
/*@shared*/ double maxColV;
/*@shared*/ int maxRow;
int visitCount;
private int readyBelowIndex;
private int colMaxCount=0, nextRowToBePermuted;
Block(final int I, final int J) {
//property(I,J);
//A = (double/*[:zeroBased&&rank==1&&rect]*/) new double[[0:B*B-1]] (point [i]) {
//double d =Math.random();
//return (I==J && i %B==i)? format(20.0*d+10.0/RAND_MAX,4)
//: format(10.0*d/RAND_MAX, 4);};
maxCount=min(I,J);
readyBelowIndex=I;
visitCount=0;
nextRowToBePermuted=I*B;
}
Block(final Block b) {
//property(b.I,b.J);
//A = (double[:zeroBased&&rank==1&&rect]) new double[[0:B*B-1]] (point [i]) { return b.A[i];};
maxCount = min(I,J);
for(int i=0; i<B*B; i++) A[i] = b.A[i];
readyBelowIndex = I;
visitCount=0;
nextRowToBePermuted=I*B;
}
//public String toString() {
//return "Block("+I+","+J+ (ready? " " : " !") + "ready) count=" +count
//+ "/"+maxCount+" rbc=" + readyBelowIndex + " LUCol=" + LUCol
//+ " colMaxCount=" + colMaxCount + " #visits=" + visitCount;
//}
boolean step(final int startY, final int[] startX) {
visitCount++;
if (count==maxCount) {
return I<J ? stepIltJ() : (I==J ? stepIeqJ() : stepIgtJ());
} else {
Block IBuddy=getBlock(I, count);
if (!IBuddy.ready) return false;
Block JBuddy=getBlock(count,J);
if (!JBuddy.ready) return false;
mulsub(IBuddy, JBuddy);
count++;
return true;
}
}
/*nullable<Block>*/ Block nextBlock=null;
boolean stepIltJ() {
final Block diag = getBlock(I,I);
if(! diag.ready) return false;
while (nextRowToBePermuted < (I+1)*B) {
if (nextBlock != null) {
if (nextBlock.count !=I) return false;
} else {
final int target = pivots.get(nextRowToBePermuted);
if (target != nextRowToBePermuted) {
nextBlock = getBlock(target/B, J);
if (nextBlock.count != I) return false;
}
}
permute(nextRowToBePermuted, pivots.get(nextRowToBePermuted));
nextRowToBePermuted++; nextBlock=null;
}
backSolve(diag);
return ready = true;
}
boolean stepIgtJ() {
if(LUCol>=0) {
Block diag = getBlock(J,J);
if(!diag.ready && !(diag.LUCol > LUCol)) return false;
lower(diag, LUCol);
if(LUCol==B-1) ready=true;
}
++LUCol;
if(LUCol <= B-1) computeMax(LUCol);
return true;
}
boolean stepIeqJ() {
if(LUCol==-1) {
for(;readyBelowIndex<bx && (getBlock(readyBelowIndex, J).count==J); readyBelowIndex++);
if(readyBelowIndex < bx) return false;
}
if(LUCol>=0) {
if(colMaxCount==0) colMaxCount = I+1;
Block block;
for(;colMaxCount<bx && ((block=getBlock(colMaxCount,J)).LUCol >=LUCol); colMaxCount++) {
if(fabs(block.maxColV) > fabs(maxColV)) {
maxRow = block.maxRow;
maxColV = block.maxColV;
}
}
if(colMaxCount < bx) return false;
final int row = I*B+LUCol;
pivots.set(row, maxRow);
if(row != maxRow) permute(row, maxRow);
LU(LUCol);
if(LUCol==B-1) ready=true;
}
++LUCol;
if(LUCol<=B-1) {
computeMax(LUCol, LUCol);
colMaxCount=0;
}
return true;
}
void computeMax(int col) { computeMax(col, 0); }
void computeMax(int col, int startRow) {
int ord=ord(startRow,col);
maxColV = A[ord];
maxRow = I*B+startRow;
for(int i=startRow+1; i<B; i++) {
final double a = A[ord++];
if(fabs(a) > fabs(maxColV)) {
maxRow = I*B+i;
maxColV = a;
}
}
//assert in(maxRow, 0, N);
}
void permute(int row1, int row2) {
final Block b = getBlock(row2/B, J);
int ord1=row1%B, ord2=row2%B;
for(int j=0; j<B; j++){
final double v1 = A[ord1], v2 = b.A[ord2];
A[ord1]=v2; b.A[ord2]=v1;
ord1+=B; ord2+=B;
}
}
void lower(final Block diag, final int col) {
//if (C_MODE) {
//blockLower(this.A, diag.A, col, B, diag.get(col,col));
//} else {
for(int i=0; i<B; i++) {
double r=0.0; for(int k=0; k<col; k++) r += get(i,k)*diag.get(k,col);
int ord = ord(i,col);
A[ord] = (A[ord] -r)/diag.get(col,col);
}
//}
}
void backSolve(final Block diag) {
//if (C_MODE)
//blockBackSolve(this.A, diag.A, B);
//else
for (int i=0; i<B; ++i) {
for (int j=0; j<B; ++j) {
int iord = i,jord = B*j;
double r=0.0; for (int k=0; k<i; ++k) {
r += diag.A[iord]*A[jord];
iord +=B; jord++;
}
A[ord(i,j)] -=r;
}
}
}
void mulsub(final Block left, final Block upper) {
//if (C_MODE)
//blockMulSub(this.A, left.A, upper.A, B);
//else
for(int i=0; i<B; ++i) {
for(int j=0; j<B; ++j) {
int iord = ord(i,0),jord = ord(0,j);
double r=0; for(int k=0; k<B; ++k) {
r += left.A[iord]*upper.A[jord+k];
iord +=B;
}
A[ord(i,j)] -=r;
}
}
}
void LU(final int col) {
for (int i = 0; i < B; ++i) {
int iord=ord(i,0), jord=ord(0,col), m = min(i,col);
double r = 0.0; for(int k=0; k<m; ++k) {
r += A[iord]*A[jord+k];
iord +=B;
}
A[jord+i] -=r;
if(i>col) A[jord+i] /= A[jord+col];
}
}
int ord(int i, int j) { return i+j*B; }
double get(int i, int j) { return A[ord(i,j)]; }
void set(int i, int j, double v) { A[ord(i,j)] = v; }
void negAdd(int i, int j, double v) { A[ord(i,j)] -= v; }
void posAdd(int i, int j, double v) { A[ord(i,j)] += v; }
}
//static extern void blockLower(double[.] me, double[.] diag, int col, int B, double diagColCol);
//static extern void blockBackSolve(double[.] me, double[.] diag, int B);
//static extern void blockMulSub(double[.] me, double[.] left, double[.] upper, int B);
}