COSTA: COSt and Termination Analyzer for Java Bytecode
    
    package x10;


import java.util.concurrent.atomic.*; 


public class PLU2C { //(int px, int py, int nx, int ny, int B) {   
    int px; // DZ
    int py; // DZ
    int nx; // DZ
    int ny; // DZ
    int B; // DZ

    public static final int NUM_TESTS=10, LOOK_AHEAD=6, RAND_MAX=100;
    final AtomicIntegerArray pivots; 
    final int bx,by,N;
    /*final*/ boolean C_MODE;
    final Blocks[] A;


    public PLU2C(int px/*_*/, int py/*_*/, int nx/*_*/, int ny/*_*/, int B/*_*/, boolean C_MODE/*_*/) {
    	//property(px_,py_,nx_,ny_,B_);
	bx=nx*px; by=ny*py; N=bx*B; //C_MODE=C_MODE_;
	//assert bx==by;
	pivots = new AtomicIntegerArray(N);
	for(int i=0; i</*pivots.length()*/N; i++) pivots.set(i,i);
	A = new Blocks[px*py];
	for (int i=0; i < px*py; i++) A[i] = new Blocks(nx*ny);
	for(int pi=0; pi<px; ++pi) 
	    for(int pj=0; pj<py; ++pj) 
		for(int i=0; i<nx; ++i) 
		    for(int j=0; j<ny; ++j) 
			A[pord(pi,pj)].z[lord(i,j)] = new Block(i*px+pi, j*py+pj); 
    }
    

    public PLU2C(PLU2C o) {
    	//property(o.px,o.py,o.nx,o.ny,o.B);
    	bx=o.bx; by=o.by; N=o.N;C_MODE=o.C_MODE;
    	pivots = o.pivots;
    	A = new Blocks[px*py];
    	for (int i=0; i<px*py;++i) A[i] = new Blocks(nx*ny);
    	for(int pi=0; pi<px; ++pi) 
	    for(int pj=0; pj<py; ++pj) 
		for(int i=0; i<nx; ++i) 
		    for(int j=0; j<ny; ++j) 
			A[pord(pi,pj)].z[lord(i,j)] = new Block(o.getLocal(pi, pj, i, j));
    }


    //static double format(double v, int precision){
    //int scale=1;   
    //for(int i=0; i<precision; i++) scale *= 10;
    //return ((int)(v*scale))*1.0/scale;
    //}


    static int max(int a, int b) { return a>b?a:b; }


    static double max(double a, double b) { return a>b?a:b;}


    static double fabs(double v){ return v>0?v:-v; }


    static int min(int a, int b) { return a>b?b:a; }


    //static double flops(int n) { return ((4.0*n-3.0)*n-1.0)*n/6.0; }


    boolean in(int x, int low, int high) { return low <= x && x < high;}


    int pord(int i, int j) { return i*py+j;	}


    int lord(int i, int j) { return i*ny+j; }


    Block getBlock(int i, int j) { return A[pord(i%px, j%py)].z[lord(i/px,j/py)]; }


    Block getLocal(int pi, int pj, int ni, int nj) { 
    	return A[pord(pi,pj)].z[lord(ni,nj)]; 
    }


    class Blocks {
    	final Block[] z;
    	Blocks(int n) { z=new Block[n]; }
    }


    //PLU2C applyPivots(boolean lowerOnly) {
    //for(int i=0; i<bx; i++) 
    //for(int j=0; j< (lowerOnly? i : by);  j++) 
    //for(int r=i*B; r<(i+1)*B; r++) {
    //final int target = pivots.get(r);
    //if(r != target) getBlock(i,j).permute(r, target);	
    //}
    //return this;
    //}


    //boolean verify(PLU2C M) {
    //double max_diff = 0.0;
    //for (int i = 0; i < N; i++) {
    //int iB = i%B;
    //for (int j = 0; j < N; j++) {
    //final int I=i/B, J=j/B, jB = j%B;
    //double v = 0.0;
    //int k;
    //for (k=0; k<i && k <= j; k++) {
    //final int K=k/B, kB=k%B;
    //v += M.getBlock(I,K).get(iB, kB) * M.getBlock(K,J).get(kB, jB);
    //}
    //if (k==i && k <= j) {
    //final int K=k/B, kB=k%B;
    //v += M.getBlock(K,J).get(kB, jB);
    //}
    //double diff = fabs(getBlock(I,J).get(iB, jB) - v);
    //max_diff = max(diff, max_diff);
    //}
    //}
    //return (max_diff <= 0.01);
    //}


    public static void main(String[] a) { 
	//System.out.print("PLU2C ");
	//if (a.length < 4) {
	//System.out.println("Usage: PLU2C N b px py [C_Mode] [verify] [libraryFileName] ");
	//return;
	//}
	final int N = a.length;//java.lang.Integer.parseInt(a[0]);
	final int B= a.length;//java.lang.Integer.parseInt(a[1]);
	final int px= a.length;//java.lang.Integer.parseInt(a[2]); 
	final int py= a.length;//java.lang.Integer.parseInt(a[3]);
	//boolean VERIFY = true;
	boolean C_MODE=false;
	//String libraryName = "/vol/x10/vj/libPLUInC.so";
	//if (a.length > 4) C_MODE=java.lang.Boolean.parseBoolean(a[4]);
	//if (a.length > 5) VERIFY=java.lang.Boolean.parseBoolean(a[5]);
	//if (a.length > 6) libraryName = a[6];
	final int nx = N / (px*B), ny = N/(py*B);
	//assert (N % (px*B) == 0 && N % (py*B) == 0);
	//System.out.println("N="+N + " B=" + B + " px=" + px + " py=" + py + " nx=" + nx + " ny="+ny);
	//if (C_MODE) System.load(libraryName);
	final PLU2C orig = new PLU2C(px,py,nx,ny,B, C_MODE);
	int i=0;
	//double[] flops = new double[NUM_TESTS];
	while (i<NUM_TESTS) {
	    PLU2C plu = new PLU2C(orig);
	    //System.gc(); System.out.print("Starting...");
	    //long s = - System.nanoTime();
	    run(plu,nx,ny,px,py); //DZ
    	    //s += System.nanoTime();
	    //flops[i]=format(flops(N)/(s)*1000, 3);
	    //System.out.print(" Time="+(s)/1000000+"ms"+" Rate="+flops[i]+" MFLOPS");
	    //if (VERIFY) {
	    //System.out.print(" (Verifying...");
	    //long v = - System.nanoTime();
	    //boolean correct = new PLU2C(orig).applyPivots(false).verify(plu.applyPivots(true));
	    //v += System.nanoTime();
	    //System.out.print(((v)/1000000) + " ms " + (correct?" ok)":" fail)"));
	    //} 
	    //System.out.println();
	    i++;
	} 
	//double max = 0.0; for (i=0; i < NUM_TESTS; i++) if (max < flops[i]) max=flops[i];
	//System.out.println("Max rate: " + max);
    }


    public static void run(PLU2C o, int nx, int ny, int px, int py) { //DZ
	Conc.finish_begin();
	run_1(o,nx,ny,px,py); //DZ
	Conc.finish_end();
    }

    public static void run_1(PLU2C o, int nx, int ny, int px, int py) { //DZ
	for (int pi=0; pi<px; pi++) {
	    for (int pj=0; pj<py; pj++) {
		Conc.async_begin();
		run_2(o,nx,ny,pi,pj); //DZ
		Conc.async_end();
	    }
	}
    }

    public static void run_2(PLU2C o, int nx, int ny, int pi, int pj) { //DZ
	int startY=0, loopCount=0, startX[]=new int[ny];
	final Block[] myBlocks=o.A[o.pord(pi,pj)].z;
	while(startY < ny) { 
	    //boolean done=false;
	    for (int j=startY; j < min(startY+LOOK_AHEAD, ny) /*&& !done*/; ++j) {
		for (int i=startX[j]; i <nx; ++i) {
		    final Block b = myBlocks[o.lord(i,j)];
		    if (b.ready) {
			if (i==startX[j]) startX[j]++;
		    } /*else done |= b.step(startY, startX)*/;
		    Thread.yield();
		}
	    }
	    //if (startX[startY]==nx) { startY++;}
	    startY++;
	    loopCount++;
	    //if (loopCount % 100000 == 0) System.err.println("w("+pi+","+pj
	    //+ "startY=" + startY + "startX=" + startX + "):" + loopCount);
	}
    }

    
    class Block {//(int/*(0,bx)*/ I, int/*(0,by)*/ J) {
	int I; // DZ
	int J; //DZ
	
	final int maxCount; 
	final double/*[:zeroBased&&rank==1&&rect]*/ A[]=new double[2000]; // DZ
	/*@shared*/ boolean ready = false;
	/*@shared*/ private int count=0; 
	/*@shared*/ int LUCol=-1;
	/*@shared*/ double maxColV;
	/*@shared*/ int maxRow;
	int visitCount;
	private int readyBelowIndex; 	
	private int colMaxCount=0, nextRowToBePermuted;


	Block(final int I, final int J) {
	    //property(I,J);
	    //A = (double/*[:zeroBased&&rank==1&&rect]*/) new double[[0:B*B-1]] (point [i]) { 
	    //double d =Math.random();
	    //return (I==J && i %B==i)? format(20.0*d+10.0/RAND_MAX,4)
	    //: format(10.0*d/RAND_MAX, 4);};			
	    maxCount=min(I,J);
	    readyBelowIndex=I;
	    visitCount=0;
	    nextRowToBePermuted=I*B;
	}


	Block(final Block b) {
	    //property(b.I,b.J); 
	    //A = (double[:zeroBased&&rank==1&&rect]) new double[[0:B*B-1]] (point [i]) { return b.A[i];};
	    maxCount = min(I,J);
	    for(int i=0; i<B*B; i++) A[i] = b.A[i];
	    readyBelowIndex = I;
	    visitCount=0;
	    nextRowToBePermuted=I*B;
	}


	//public String toString() {
	//return "Block("+I+","+J+ (ready? " " : " !") + "ready) count=" +count 
	//+ "/"+maxCount+" rbc=" + readyBelowIndex + " LUCol=" + LUCol 
	//+  " colMaxCount=" + colMaxCount + " #visits=" + visitCount;
	//}
	

	boolean step(final int startY, final int[] startX) {
	    visitCount++;
	    if (count==maxCount) {
		return I<J ? stepIltJ() : (I==J ? stepIeqJ() : stepIgtJ());
	    } else {
		Block IBuddy=getBlock(I, count);
		if (!IBuddy.ready) return false;
		Block JBuddy=getBlock(count,J);
		if (!JBuddy.ready) return false;
		mulsub(IBuddy, JBuddy);
		count++;
		return true;
	    }
	}


	/*nullable<Block>*/ Block nextBlock=null;


	boolean stepIltJ() {
	    final Block diag = getBlock(I,I);
	    if(! diag.ready) return false;
	    while (nextRowToBePermuted < (I+1)*B) {
		if (nextBlock != null) {
		    if (nextBlock.count !=I) return false;
		} else {
		    final int target = pivots.get(nextRowToBePermuted);
		    if (target != nextRowToBePermuted) { 
			nextBlock = getBlock(target/B, J);
			if (nextBlock.count != I) return false;
		    }
		}
		permute(nextRowToBePermuted, pivots.get(nextRowToBePermuted));
		nextRowToBePermuted++; nextBlock=null;
	    }
	    backSolve(diag);
	    return ready = true; 
	}


	boolean stepIgtJ() { 
	    if(LUCol>=0) {
		Block diag = getBlock(J,J);
		if(!diag.ready && !(diag.LUCol > LUCol)) return false;
		lower(diag, LUCol);
		if(LUCol==B-1) ready=true; 
	    }
	    ++LUCol;
	    if(LUCol <= B-1) computeMax(LUCol);
	    return true;
	}


	boolean stepIeqJ() { 
	    if(LUCol==-1) { 
		for(;readyBelowIndex<bx && (getBlock(readyBelowIndex, J).count==J); readyBelowIndex++);
		if(readyBelowIndex < bx) return false;
	    }
	    if(LUCol>=0) {
		if(colMaxCount==0) colMaxCount = I+1;
		Block block;
		for(;colMaxCount<bx && ((block=getBlock(colMaxCount,J)).LUCol >=LUCol); colMaxCount++) {
		    if(fabs(block.maxColV) > fabs(maxColV)) {
			maxRow = block.maxRow;
			maxColV = block.maxColV;
		    }
		}
		if(colMaxCount < bx) return false;
		final int row = I*B+LUCol;
		pivots.set(row, maxRow);
		if(row != maxRow) permute(row, maxRow);
		LU(LUCol);
		if(LUCol==B-1)  ready=true;
	    }
	    ++LUCol;
	    if(LUCol<=B-1)	{
		computeMax(LUCol, LUCol);
		colMaxCount=0;
	    }
	    return true;
	}


	void computeMax(int col) { computeMax(col, 0); }
	
	
	void computeMax(int col, int startRow) {
	    int ord=ord(startRow,col);
	    maxColV = A[ord];
	    maxRow = I*B+startRow;
	    for(int i=startRow+1; i<B; i++) {
		final double a = A[ord++];
		if(fabs(a) > fabs(maxColV)) {
		    maxRow = I*B+i;
		    maxColV = a;
		}
	    }
	    //assert in(maxRow, 0, N);
	}
	

	void permute(int row1, int row2) {
	    final Block b = getBlock(row2/B, J);
	    int ord1=row1%B, ord2=row2%B;
	    for(int j=0; j<B; j++){
		final double v1 = A[ord1], v2 = b.A[ord2];
		A[ord1]=v2; b.A[ord2]=v1;
		ord1+=B; ord2+=B;
	    }
	}


	void lower(final Block diag, final int col) {
	    //if (C_MODE) {
	    //blockLower(this.A, diag.A, col, B, diag.get(col,col));
	    //} else {
	    for(int i=0; i<B; i++) {
		double r=0.0; for(int k=0; k<col; k++) r += get(i,k)*diag.get(k,col);
		int ord = ord(i,col);
		A[ord] = (A[ord] -r)/diag.get(col,col);
	    }
	    //}
	}


	void backSolve(final Block diag) { 
	    //if (C_MODE) 
	    //blockBackSolve(this.A, diag.A, B);
	    //else 
	    for (int i=0; i<B; ++i) { 
		for (int j=0; j<B; ++j) {
		    int iord = i,jord = B*j; 
		    double r=0.0; for (int k=0; k<i; ++k) {
			r += diag.A[iord]*A[jord]; 
			iord +=B; jord++;
		    }
		    A[ord(i,j)] -=r;
		}
	    }
	} 


	void mulsub(final Block left, final Block upper) { 
	    //if (C_MODE) 
	    //blockMulSub(this.A, left.A, upper.A, B);
	    //else 
	    for(int i=0; i<B; ++i) { 
		for(int j=0; j<B; ++j) {
		    int iord = ord(i,0),jord = ord(0,j);
		    double r=0; for(int k=0; k<B; ++k) { 
			r += left.A[iord]*upper.A[jord+k]; 
			iord +=B;
		    }
		    A[ord(i,j)] -=r; 
		}
	    }
	}


	void LU(final int col) { 
	    for (int i = 0; i < B; ++i) { 
		int iord=ord(i,0), jord=ord(0,col), m = min(i,col);
		double r = 0.0; for(int k=0; k<m; ++k) {
		    r += A[iord]*A[jord+k]; 
		    iord +=B;
		}
		A[jord+i] -=r;
		if(i>col) A[jord+i] /= A[jord+col]; 
	    }				
	} 


	int ord(int i, int j) { return i+j*B; } 


	double get(int i, int j) { return A[ord(i,j)];	}


	void set(int i, int j, double v) { A[ord(i,j)] = v; }


	void negAdd(int i, int j, double v) { A[ord(i,j)] -= v; }


	void posAdd(int i, int j, double v) { A[ord(i,j)] += v; }
    }


    //static extern void blockLower(double[.] me, double[.] diag, int col, int B, double diagColCol);
    //static extern void blockBackSolve(double[.] me, double[.] diag, int B);
    //static extern void blockMulSub(double[.] me, double[.] left, double[.] upper, int B);
}