#  This script reconciles the R version shown here, line by line, against Matlab intermediates run by Jeff Id.
#  It is more or less a transliteration of Tapio Schneider's Matlab code, with no attempt yet at simplification;
#  see http://www.gps.caltech.edu/~tapio/imputation/regem.m and pttls.m.
#  A companion script puts these steps into an iteration; they are shown one at a time here for reference.

###FUNCTIONS
 #1. pttls (transliterated from Tapio Schneider)
 #2. anom  - used on many occasions 
 
  # pttls: truncated total least squares regression computed from a matrix of
  # singular vectors (transliterated from Tapio Schneider's pttls.m).
  #
  # Arguments:
  #   V     matrix of singular vectors; rows are indexed by variable, columns
  #         by component. Must have at least length(d) columns.
  #   d     vector of singular values (NOT squared; they are squared below)
  #   colA  row indices of V for the available variables
  #   colB  row indices of V for the variables to be estimated
  #   r     vector of truncation parameters, each in 1..length(d); one solution
  #         is computed per entry
  #
  # Returns a list with
  #   Xr   array [length(colA), length(colB), length(r)] of regression coefficients
  #   Sr   array [length(colB), length(colB), length(r)], the estimated
  #        covariance of the residuals (up to a scaling factor)
  #   rho  [length(r), 1] residual norm: sqrt of the sum of the discarded d^2
  #   eta  [length(r), 1] Frobenius norm of each solution Xr[, , ir]
  pttls <- function(V, d, colA, colB, r) {
    nd <- length(d)
    n <- length(colA)   # number of columns of A (available variables)
    k <- length(colB)   # number of right-hand sides (variables to estimate)
    nr <- length(r)

    if (ncol(V) < nd) {
      stop("V must have at least length(d) columns", call. = FALSE)
    }
    if (any(r < 1) || any(r > nd)) {
      stop("every truncation parameter in r must lie in 1..length(d)",
           call. = FALSE)
    }

    # initialize output variables
    Xr <- array(0, dim = c(n, k, nr))
    Sr <- array(0, dim = c(k, k, nr))
    rho <- array(0, dim = c(nr, 1))
    eta <- array(0, dim = c(nr, 1))

    # compute a separate solution for each truncation parameter
    for (ir in seq_len(nr)) {
      rc <- r[ir]
      kept <- seq_len(rc)
      # components rc+1..nd are discarded; the set is empty when rc == nd
      # (a plain (rc + 1):nd would count downwards in that case)
      dropped <- if (rc < nd) (rc + 1):nd else integer(0)

      # drop = FALSE keeps these as matrices even when colA or colB holds a
      # single index, so the products below stay conformable
      V11 <- V[colA, kept, drop = FALSE]
      V21 <- V[colB, kept, drop = FALSE]
      Xr[, , ir] <- V11 %*% solve(t(V11) %*% V11) %*% t(V21)

      # estimated covariance matrix of residuals dB0'*dB0 (up to a scaling factor)
      V22 <- V[colB, dropped, drop = FALSE]
      d2 <- d[dropped]^2
      # nrow is given explicitly: diag() on a length-1 vector would otherwise
      # build an identity matrix of that size instead of a 1x1 matrix
      Sr[, , ir] <- V22 %*% diag(d2, nrow = length(d2)) %*% t(V22)

      rho[ir] <- sqrt(sum(d2))            # residual norm = norm([dA0 dB0], 'fro')
      eta[ir] <- sqrt(sum(Xr[, , ir]^2))  # solution norm = norm(Xr, 'fro')
    }

    list(Xr = Xr, Sr = Sr, rho = rho, eta = eta)
  }


 # anom: convert a monthly time series to anomalies from a per-calendar-month
 # climatology, rounded to 2 decimals.
 #
 # Arguments:
 #   x          a monthly `ts` vector, or a `ts` matrix with one series per column
 #   reference  years over which the monthly means are taken; years outside
 #              the span of x are ignored
 #   M          minimum number of reference years that must fall inside x
 #
 # Returns NA when fewer than M reference years overlap x. For a vector, returns
 # a monthly ts starting in January of the first year (partial first/last years
 # are padded with NA, so the result may be longer than x). For a matrix,
 # returns an object shaped like x with every column replaced by its anomalies.
 # NOTE(review): the matrix branch does no padding, so it assumes x starts in
 # January and covers whole years -- confirm against callers.
 anom <- function(x, reference = 1961:1990, M = 15) {
   span <- floor(tsp(x))
   first_year <- span[1]
   in_span <- reference %in% (span[1]:span[2])
   if (sum(in_span) < M) {
     return(NA)
   }
   # rows of the year-by-month matrix that belong to the reference period
   ref_rows <- reference[in_span] - first_year + 1

   # per-month mean over the reference years; drop = FALSE keeps a single
   # reference year as a 1 x 12 matrix so apply() still works
   climatology <- function(monthly) {
     apply(monthly[ref_rows, , drop = FALSE], 2, mean, na.rm = TRUE)
   }

   if (is.null(dim(x))) {
     # cycle() gives the calendar month of each observation without the
     # floating-point rounding involved in time(x) %% 1
     month <- cycle(x)
     padded <- c(rep(NA, month[1] - 1), x, rep(NA, 12 - month[length(month)]))
     monthly <- t(array(padded, dim = c(12, length(padded) / 12)))
     centered <- scale(monthly, center = climatology(monthly), scale = FALSE)
     return(round(ts(c(t(centered)), start = c(first_year, 1), frequency = 12), 2))
   }

   out <- x
   for (i in seq_len(ncol(x))) {
     monthly <- t(array(x[, i], dim = c(12, nrow(x) / 12)))
     centered <- round(scale(monthly, center = climatology(monthly), scale = FALSE), 2)
     out[, i] <- c(t(centered))
   }
   out
 }


###
##LOAD DATA 
###
   #this uses BAS scrape for surface and reverse-engineered Steig recon_aws for AWS. Two surface series need to be deducted (Marion, Gough)
   #version here is not quite file compatible with Steig. See Nic L on Dec 2002 problem
   # data from Terra Nova Bay appears to be same as Mario_Zucchelli #

	# Fetch the three saved-workspace files into the working directory and load
	# them. NOTE(review): they are presumed to define Data, Info and recon_aws,
	# which are used immediately below -- the file contents are not visible here.
	download.file("http://www.climateaudit.org/data/steig/Data.tab","Data.tab",mode="wb")
	load("Data.tab")
	download.file("http://www.climateaudit.org/data/steig/Info.tab","Info.tab",mode="wb")
	load("Info.tab")
	download.file("http://www.climateaudit.org/data/steig/recon_aws.tab","recon_aws.tab",mode="wb")
	load("recon_aws.tab")
	dimnames(recon_aws)[[2]] <- Info$aws_grid$id

   #surface
	surf <- Data$surface
	surf <- window(surf, start = 1957, end = c(2006, 12)) #ends in 2006 per Steig
	# column 26 is removed first so that index 17 still refers to the original column
	surf <- surf[, -26] #deletes column 26 - Marion
	surf <- surf[, -17] #deletes column 17 - Gough
	dim(surf) #600 42
	sanom <- as.matrix(surf)
	# loop over whatever columns remain instead of a hard-coded 1:42
	for (i in seq_len(ncol(sanom))) {
	  sanom[, i] <- anom(surf[, i], reference = 1957:2007)
	}
            #Steig probably used different reference period and this can be experimented with later
	(ny <- ncol(sanom)) #42

  #AWS reverse engineered (rather than READER scrape)
	anoms <- window(recon_aws, start = 1980); dim(anoms) #324 63
	dat_aws <- window(Data$aws[, colnames(recon_aws)], start = 1980, end = c(2006, 12))
	   dim(dat_aws) #324 63
	# keep reconstructed values only where the AWS record has an observation
	anoms[is.na(dat_aws)] <- NA
	   #apply(anoms,2,mean,na.rm=TRUE)  #some are zero, some aren't
	(nx <- ncol(anoms)) #63

   # combine anomalies: AWS columns first, then surface columns
	anomalies <- ts.union(anoms, sanom)
	dimnames(anomalies)[[2]] <- c(dimnames(anoms)[[2]], dimnames(sanom)[[2]])
	dim(anomalies) #600 105

###UPLOAD JEFF VERSIONS
	# The commented-out lines below record how the list `jeff` was assembled from
	# local CSV/MAT exports of the Matlab intermediates; they are kept for
	# reference only and are not run.
	#library("R.matlab");library("Rcompression")
	#id=list.files("d:/climate/data/steig/jeff");id
	#jeff=list()
	#jeff$V=V=read.csv("d:/climate/data/steig/jeff/V.csv",header=FALSE); dim(V) #105 3 
	#jeff$X=as.matrix( read.csv(file.path("d:/climate/data/steig/jeff","X  input to regem.csv" ),header=FALSE) ); dim(jeff$X) #600 105=63+42
	#jeff$d=read.csv(file.path("d:/climate/data/steig/jeff","d.csv"),header=FALSE);   length(jeff$d) # 3 1 
	#jeff$C=read.csv(file.path("d:/climate/data/steig/jeff","C.csv"),header=FALSE);	   dim(jeff$C) # 105 105
	#jeff$Xmis=readMat(file.path("d:/climate/data/steig/jeff","xmis.mat"))[[1]]  #600 105 
	#jeff$C1=read.csv(file.path("d:/climate/data/steig/jeff","C first iteration.csv" ),header=FALSE)
	#jeff$X1=read.csv(file.path("d:/climate/data/steig/jeff","X first iteration.csv"   ),header=FALSE)
	#jeff$Xafter=read.csv(file.path("d:/climate/data/steig/jeff","Xafter regem.csv"    ),header=FALSE)
	#jeff$dXmis=read.table(file.path("d:/climate/data/steig/jeff","dXmis.dat"    ),header=TRUE)
	#names(jeff$dXmis)=c("iter","p_eff","dXmis","dXmis_ratio")
	#jeff$X2=read.csv(file.path("d:/climate/data/steig/jeff","X second iteration2.csv"   ),header=FALSE)
	#jeff$C2=read.csv(file.path("d:/climate/data/steig/jeff","C second iteration2.csv" ),header=FALSE)
	#jeff$Xmis2=readMat(file.path("d:/climate/data/steig/jeff","xmis2.mat"))[[1]]  #600 105 
	
	# Download the saved copy to temp.dat and load it into the workspace.
	# NOTE(review): presumably this defines `jeff` as built above -- the script
	# reads jeff$X, jeff$C, jeff$Xmis etc. from here on; confirm the file contents.
	download.file("http://www.climateaudit.org/data/steig/jeff.tab","temp.dat",mode="wb");load("temp.dat")

 ##INITIALIZATION
	# compare the R-built anomalies with Jeff's input matrix, then continue
	# from Jeff's version so later steps can be compared like for like
	X <- anomalies
	range(round(X, 5) - jeff$X, na.rm = TRUE)
		#-0.005  0.005 matches Jeff version

	X <- jeff$X #use Jeff version
	dimnames(X)[[2]] <- dimnames(anomalies)[[2]]
	n <- nrow(X); p <- ncol(X)
	indmis <- is.na(X)
	(nmis <- sum(indmis)) #39551

	# for every row (time step), the column indices that are available / missing
	kavlr <- lapply(seq_len(n), function(j) (1:p)[!indmis[j, ]])
	kmisr <- lapply(seq_len(n), function(j) (1:p)[indmis[j, ]])

	(dofC <- n - 1) #599
	# note: this variable shares its name with base::trunc(); the name is kept
	# because later steps read `trunc`
	(trunc <- min(3, n - 1, p)) #3
	(dofS <- dofC - trunc) #596
	# sum of squares helper used in the dXmis calculations
	ssq <- function(x) sum(x^2)

 ## INITIAL INFILL
     # Column-center the data, remember the column means, then fill every
     # missing entry with 0 (i.e. the column mean of the centered data).
     # question: is the result sensitive to this start point
	X <- jeff$X
	colnames(X) <- colnames(anomalies)
	X <- scale(X, center = TRUE, scale = FALSE)
	M <- attr(X, "scaled:center")
	M[1:5]
           #V1            V2            V3            V4            V5 
	    #-3.589744e-07  1.289103e-01 -6.777778e-06 -3.431854e-02 -1.000000e-06 
	barplot(M) #perfect 0 for surface; aws a little offcenter

	X[indmis] <- 0 #zero  page 6

 #STEP ONE
  #setup with svd
	# Covariance of the centered, zero-infilled X; the range() on the next line
	# checks it against cov(X).
 	C=t(X)%*%X/dofC   #this is the same as cov(X)
	  range( (t(X)%*%X)/dofC - cov(X))  #   -7.105427e-15  1.243450e-14
	# peff_ave is set here but not read anywhere else in the visible script
	peff_ave=0
	# accumulator for the residual covariance; the pttls loop adds to it
	CovRes=array(0,dim=c(p,p))
	# column standard deviations, taken from the diagonal of C
	D=sqrt(diag(C)) ;D[1:5]# also sd
	  #0.3593756 1.4782013 1.7781748 0.4211105 1.0387703 
	  #same as apply(X,2,sd) ) and attributes(X)$"scaled:scale"

	# convert covariance to correlation and standardize X to match
	C=diag(1/D)%*%C%*% diag(1/D) #this is cor(X) and jeff$C
	  range( C- cor(X) )  # -1.443290e-15  1.554312e-15
	  range(C-jeff$C) #  -6.787791e-06  8.027532e-06
	X=X%*%diag(1/D) #standardize to sd=1

	# truncation parameter for this iteration (trunc was set during initialization)
	p_eff=trunc
	# decomposition of the correlation matrix; $v and $d are passed to pttls
	svd.ant=svd(C)
	# NOTE(review): X was just rebuilt by a matrix product with diag(1/D), which
	# may have dropped its column names; also this labels the COLUMNS of v
	# (components) rather than its rows -- confirm this line does what is intended.
	dimnames(svd.ant$v)[[2]]=dimnames(X)[[2]]
	   svd.ant$d[1:5] #  17.222444  9.653170  6.596398  4.626054  3.877604

   # Run pttls once per time step (row of X): regress that row's missing
   # columns on its available columns using the decomposition in svd.ant.
     #placeholders set up
	Xmis <- array(0, dim = c(n, p))   # imputed values, standardized units
	Xerr <- array(0, dim = c(n, p))   # error estimates for the imputed values
	B <- vector("list", n)            # per-row regression coefficients
	S <- vector("list", n)            # per-row residual covariance

	for (j in seq_len(n)) {
	  avail <- kavlr[[j]]
	  miss <- kmisr[[j]]
	  # use the truncation parameter set for this iteration rather than a literal 3
	  test <- pttls(svd.ant$v, svd.ant$d, colA = avail, colB = miss, r = p_eff)

	  # [, , 1] drops dimensions of extent 1; restore the matrix shape so rows
	  # with a single available or a single missing column still work below
	  Bj <- test$Xr[, , 1]
	  dim(Bj) <- c(length(avail), length(miss))   #e.g. 23 82
	  dimnames(Bj) <- list(as.character(avail), as.character(miss))
	  Sj <- test$Sr[, , 1]
	  dim(Sj) <- c(length(miss), length(miss))    #e.g. 82 82

	  B[[j]] <- Bj
	  S[[j]] <- Sj
	  Xerr[j, miss] <- dofC / dofS * sqrt(diag(Sj))
	  Xmis[j, miss] <- X[j, avail] %*% Bj
	  CovRes[miss, miss] <- CovRes[miss, miss] + Sj
	}
 
   ##rescale to original scaling
   # scale(..., center=FALSE, scale=1/D) divides column i by 1/D[i], i.e. multiplies
   # it by D[i], undoing the earlier standardization X %*% diag(1/D).
    # X first step 
	# X1a keeps a copy of X at this point, before the imputed values are inserted
	X=X1a=scale(X,center=FALSE,scale=1/D)
 	   #col0=rep(1,600);col0[indmis[,1]]=2
	   #plot(X1a[,1],jeff$X1[,1],col=col0);  abline(0,1)
	   #legend("topleft",fill=1:2,legend=c("present","infilled"))
	     # indmis not right yet: variance very different
    # Xmis
	# imputed values back in original units, compared with Jeff's Xmis for column 36
	Xmis=scale(Xmis,center=FALSE,scale=1/D)
	   plot(Xmis[,36],jeff$Xmis[,36]);abline(0,1) #perfect match
	   range(Xmis[,36]-jeff$Xmis[,36]) #   -1.682834e-05  7.824493e-06
	    ##barplot(apply(abs(Xmis-jeff$Xmis),2,max,na.rm=T) ,ylim=c(0,.5)) #  -0.002638405  0.002384520
 
    # Xerr
	Xerr=scale(Xerr,center=FALSE,scale=1/D)

    # C, CovRes
	   # the two range() calls are checks only; they do not change any state
	   range(C-jeff$C) #    -6.787791e-06  8.027532e-06
	   range(cov(X)-diag(D)%*%C%*% diag(D) ) #  -1.154632e-14  7.993606e-15
	# pre- and post-multiplying by diag(D) turns the correlation-scale matrices
	# back into covariance scale
	C= diag(D)%*%C%*% diag(D)	#rescale to covariance 
	CovRes=diag(D)%*% CovRes %*% diag(D) #rescale

    # dXmis: RMS difference between the new imputed values and the values
    # currently held in X at the missing positions
	# `iter` was previously used here before being defined anywhere.
	# NOTE(review): assumes row k of jeff$dXmis holds iteration k, as the
	# step-two comparison (row iter = 2) implies -- confirm.
	iter <- 1
	dXmis <- sqrt(ssq(Xmis[indmis] - X[indmis])) / sqrt(nmis)
        c(dXmis, jeff$dXmis[iter, 3]) #  1.258116 1.258000 #bingo

    # X second part
	X[indmis] <- Xmis[indmis] #update data
	  #plot(X[,36]-jeff$X1[,36],type="h",col=col0,ylim=c(-.5,.5));  #small not quite right yet
	X <- scale(X, scale = FALSE) #recenter
	Mup <- attr(X, "scaled:center"); round(Mup, 3)[1:7]
	   #iter 2: -0.007 -0.095 -0.041 -0.010 -0.094 -0.056 -0.091  0.001 -0.097 -0.008  0.000 
 	M <- M + Mup; round(M, 3)[1:7] #updated mean vector
	  #-0.007  0.034 -0.041 -0.044 -0.094 -0.056 -0.208 
	j <- 36
	# col0 was only defined in a commented-out line; colour the plotted column
	# by whether each value was observed (1) or infilled (2)
	col0 <- ifelse(indmis[, j], 2, 1)
	plot(X[, j] - jeff$X1[, j], type = "h", col = col0, ylim = c(-.5, .5))
	   #this matches perfectly
 	   #barplot( apply(abs(X-jeff$X1),2,max),ylim=c(0,.5)) #perfect

   # C second part
	C <- t(X) %*% X / dofC   #cov(X)
	   range(C - jeff$C1) #     -8.113341e-05  3.206414e-04

   #summarize iteration comparisons
	   range(X - jeff$X1) #  -0.0007058679  0.0005054197
	   range(C - jeff$C1) #   -8.113341e-05  3.206414e-04
	   range(Xmis - jeff$Xmis) #    -0.0000974938  0.0001219781
	   dXmis - jeff$dXmis[iter, 3] #  0.0001155552
 
   #examine coefficients
  	barplot(B[[1]][, 1], las = 3)
	# per time step: number of positive and of negative regression coefficients
	# (vapply fixes the result to a 2 x length(B) integer matrix)
	test <- vapply(B, function(A) c(sum(A > 0), sum(A < 0)), integer(2))	 #  760 436
	test <- t(test)
	test <- ts(test, start = c(1957, 1), frequency = 12)
    #GDD(file="d:/climate/images/2009/steig/positive_coefficient_step1.gif",type="gif",w=420,h=300)
	plot.ts(test[, 1] / (test[, 1] + test[, 2]), type = "l", ylab = "")
	title("Positive Coefficient Proportion - Step 1")
	#dev.off()

      # regem collects the results of each step; it has to exist before [[<-
      regem <- list()
      regem[[1]] <- list(X = X, C = C, B = B, S = S, dXmis = dXmis)

   ##STEP TWO 
	# iteration counter; used below to pick the matching row of jeff$dXmis
	iter=2
	# peff_ave is set here but not read anywhere else in the visible script
	peff_ave=0
	# reset the residual-covariance accumulator for this iteration
	CovRes=array(0,dim=c(p,p))
	# column standard deviations from the diagonal of the updated covariance C
	D=sqrt(diag(C)) ;D[1:5]# also sd
	  # 0.3994596 1.7490152 2.1634429 0.4639402 1.8015410

	  # check only: correlation built from C agrees with cor(X)
	  range( diag(1/D)%*%C%*% diag(1/D) - cor(X) )  #  -1.776357e-15  2.553513e-15
	X=X%*%diag(1/D) #standardize to sd=1
	# covariance -> correlation
	C=diag(1/D)%*%C%*% diag(1/D)
	  range(C-cor(X)) #   -1.776357e-15  2.553513e-15

	# decomposition of the correlation matrix; $v and $d are passed to pttls
	svd.ant=svd(C)
	# truncation parameter for this iteration
	p_eff=trunc
	svd.ant$d[1:5] #  34.816604 17.551981 13.261866  2.544238  1.972014

	# Run pttls once per time step (row of X): regress that row's missing
	# columns on its available columns using the decomposition in svd.ant.
	Xmis <- array(0, dim = c(n, p))   # imputed values, standardized units
	Xerr <- array(0, dim = c(n, p))   # error estimates for the imputed values
	B <- vector("list", n)            # per-row regression coefficients
	S <- vector("list", n)            # per-row residual covariance

	for (j in seq_len(n)) {
	  avail <- kavlr[[j]]
	  miss <- kmisr[[j]]
	  # use the truncation parameter set for this iteration rather than a literal 3
	  test <- pttls(svd.ant$v, svd.ant$d, colA = avail, colB = miss, r = p_eff)

	  # [, , 1] drops dimensions of extent 1; restore the matrix shape so rows
	  # with a single available or a single missing column still work below
	  Bj <- test$Xr[, , 1]
	  dim(Bj) <- c(length(avail), length(miss))   #e.g. 23 82
	  dimnames(Bj) <- list(as.character(avail), as.character(miss))
	  Sj <- test$Sr[, , 1]
	  dim(Sj) <- c(length(miss), length(miss))    #e.g. 82 82

	  B[[j]] <- Bj
	  S[[j]] <- Sj
	  Xerr[j, miss] <- dofC / dofS * sqrt(diag(Sj))
	  Xmis[j, miss] <- X[j, avail] %*% Bj
	  CovRes[miss, miss] <- CovRes[miss, miss] + Sj
	}

  ##rescale to original scaling
  # scale(..., center=FALSE, scale=1/D) divides column i by 1/D[i], i.e. multiplies
  # it by D[i], undoing the standardization X %*% diag(1/D) done in the setup.
    # X first step 
	# X1a keeps a copy of X at this point, before the new imputed values are inserted
	X=X1a=scale(X,center=FALSE,scale=1/D)
 	     # not final yet
    # Xmis
	# imputed values back in original units, compared with Jeff's Xmis2 for column 36
	Xmis=scale(Xmis,center=FALSE,scale=1/D)
	   plot(Xmis[,36],jeff$Xmis2[,36]);abline(0,1) #perfect match
	   range(Xmis[,36]-jeff$Xmis2[,36]) #   -2.336856e-05  1.459296e-05

    # Xerr
	Xerr=scale(Xerr,center=FALSE,scale=1/D)

    # C, CovRes
	   # check only; does not change any state
	   range(cov(X)-diag(D)%*%C%*% diag(D) ) #  -1.154632e-14  7.993606e-15
	# pre- and post-multiplying by diag(D) turns the correlation-scale matrices
	# back into covariance scale
	C= diag(D)%*%C%*% diag(D)	#rescale to covariance 
	CovRes=diag(D)%*% CovRes %*% diag(D) #rescale

    # dXmis: RMS difference between the new imputed values and the values
    # currently held in X at the missing positions; compared with row `iter`
    # of jeff$dXmis (iter is set at the start of step two)
	dXmis <- sqrt(ssq(Xmis[indmis] - X[indmis])) / sqrt(nmis)
        c(dXmis, jeff$dXmis[iter, 3]) #   0.4673552 0.4674000

    # X second part
	X[indmis] <- Xmis[indmis] #update data
	X <- scale(X, scale = FALSE) #recenter
	Mup <- attr(X, "scaled:center"); round(Mup, 3)[1:7]
	   # -0.004 -0.060 -0.034 -0.008 -0.049 -0.030 -0.052

 	M <- M + Mup; round(M, 3)[1:7] #updated mean vector
	  #-0.011 -0.026 -0.075 -0.052 -0.143 -0.086 -0.260 
        j <- 36
	# col0 was only defined in a commented-out line; colour the plotted column
	# by whether each value was observed (1) or infilled (2)
	col0 <- ifelse(indmis[, j], 2, 1)
	plot(X[, j] - jeff$X2[, j], type = "h", col = col0, ylim = c(-.5, .5))
	   #this matches perfectly
 	   #barplot( apply(abs(X-jeff$X2),2,max),ylim=c(0,.5)) #perfect

   # C second part
	C <- t(X) %*% X / dofC   #cov(X)
	   range(C - jeff$C2) #     -0.0004164527  0.0004466800


   #summarize iteration comparisons
	   range(X - jeff$X2) #  -0.0005188823  0.0006259449
	   range(C - jeff$C2) #   -0.0004164527  0.0004466800
	   range(Xmis - jeff$Xmis2) #    -9.410853e-05  1.098465e-04
	   dXmis - jeff$dXmis[iter, 3] #  -4.476494e-05

   #examine coefficients
  	barplot(B[[1]][, 1], las = 3)

	# per time step: number of positive and of negative regression coefficients
	# (vapply fixes the result to a 2 x length(B) integer matrix)
	test <- vapply(B, function(A) c(sum(A > 0), sum(A < 0)), integer(2))	 #  760 436
	test <- t(test)
	test <- ts(test, start = c(1957, 1), frequency = 12)
	# GDD is never loaded by this script, so only write the GIF when the package
	# is installed; otherwise the plot goes to the current device. The file is
	# named step2 so it no longer reuses the step-1 file name.
	save_gif <- requireNamespace("GDD", quietly = TRUE)
	if (save_gif) {
	  GDD::GDD(file = "d:/climate/images/2009/steig/positive_coefficient_step2.gif", type = "gif", w = 420, h = 300)
	}
	plot.ts(test[, 1] / (test[, 1] + test[, 2]), type = "l", ylab = "")
	title("Positive Coefficient Proportion - Step 2")
	# close the device only if one was opened above
	if (save_gif) dev.off()

      # regem collects the results of each step; create it if step one did not
      if (!exists("regem")) regem <- list()
      regem[[2]] <- list(X = X, C = C, B = B, S = S, dXmis = dXmis)