#####################
###THIS IS SCRIPT TO PRODUCE SIMULATIONS, FIGURES AND STATISTICS FOR

#		Stephen McIntyre and Ross McKitrick
#		 Reply to Huybers		


#################################################
#PRE REQUISITES: 
# package sandwich is used in hac-standardized calculation but this affects only one series not used in rest of calcs

###############################################

###############################################
###
##				PART 1
#
##FUNCTIONS REQUIRED FOR NARRATIVE SCRIPT TO PRODUCE SIMULATIONS, FIGURES AND STATISTICS FOR
#			GRL021750
#		 Reply to Huybers		
#		Stephen McIntyre and Ross McKitrick
#		
################################################


#LOAD FUNCTIONS AND PACKAGES
	source("http://www.climateaudit.org/scripts/huybers/functions.txt")

##LOAD COLLATED INFO FROM MBH98
	mbhdata="http://www.climateaudit.org/data/mbh98"
	use0="pairwise.complete.obs"

   #sparse series
	url<-"ftp://ftp.ncdc.noaa.gov/pub/data/paleo/paleocean/by_contributor/mann1998/nhem-sparse.dat"
	g<-read.table(url,skip=1)
	sparse<-ts(g[,2:3],start=g[1,1],end=g[nrow(g),1])
	dimnames(sparse)[[2]]<-c("instr","recon")
	sparse<-sparse[,1]-mean(sparse[(1902-1853):(1980-1853),1])
 	T<-sparse
	calibration<-c(1902,1980)
	verification<-c(1854,1901)

  #temperature pcs
	download.file(file.path(mbhdata,"pc.tab"),"temp.dat",mode="wb");load("temp.dat")  ##pc
	dim(pc) #[1] 92 16 #from 1902 -1993

  # temperature eofs used in reconstruction
	download.file(file.path(mbhdata,"eof.ordermann.tab"),"temp.dat",mode="wb");load("temp.dat")  ##pc
	eof.MBH<-eof;dim(eof) #1082 16

  # temperature eigen-values and choose first 16 as only 16 PCs are used
	download.file(file.path(mbhdata,"tpca-eigenvals.tab"),"temp.dat",mode="wb");load("temp.dat")  ##pc
	lambda<-lambda[1:16,2]
	lambda.MBH<-lambda

  #standard deviations
	download.file(file.path(mbhdata,"gridcell.stdev.ordermann.tab"),"temp.dat",mode="wb");load("temp.dat")  ##pc
	length(gridcell.stdev.ordermann) #1082  #same as mann.scale

  ##GRIDCELL INFORMATION
    # MBH uses 1082 gridcells in their "dense" reconstruction and 219 gridcells in their "sparse" reconstruction specified here
    # load list of MBH "dense" grid-cells
	download.file(file.path(mbhdata,"gridpoints.tab"),"temp.dat",mode="wb");load("temp.dat")  ##pc
	tempmann<-sort(grid2[,5])
	manngrid<-!is.na(match(c(1:2592),tempmann)) 
	ordermann<-order(grid2[,5])
	grid2<-grid2[ordermann,]
	nhtemp<-(grid2[,4]>0)   #logical vector for NH gridcells
	mu<-cos(grid2[,4]*pi/180)  #used for areal weighting
	mu.MBH<-mu

    # mask for sparse subset of 219 cells #mask nowhere specified but determiend by careful inspection of diagram
	download.file(file.path(mbhdata,"mask.edited.tab"),"temp.dat",mode="wb");load("temp.dat")  ##pc
	mask.logical<-!is.na(match(grid2[,5],mask.edited)) #sum(mask.logical) #219 #length(mask.logical)

   ##Original proxy
	url="http://www.nature.com/nature/journal/v430/n6995/extref/PROXY/data1400.txt"
	test=read.table(url) #581 23
	test=ts(test[,2:23],start=1400);N=nrow(test)
	test=extend.persist(test)
	m0=apply(test[(N-78):N,],2,mean);sd0=apply(test[(N-78):N,],2,sd)
	proxy.MBH=ts(scale(test,center=m0,scale=sd0),start=1400)
		#Gaspe is series 9; PC1 is series 16

##NOAMER NETWORK		
	tree<-read.table("http://www.climateaudit.org/data/mbh98/UVA/NOAMER.dat",header=TRUE,sep="\t")
	tree<-ts( tree[,2:ncol(tree)],start=tree[1,1],end=tree[nrow(tree),1])
	tree<-ts( tree[1:581,],start=1400,end=1980)
	tree<-extend.persist(tree)
	tree0<-tree
	dim(tree0) #581 212
	tree<-tree[,!is.na(tree[1,])]
	name1<-dimnames(tree)[[2]]
	dim(tree)#581 70
	name0<-unlist(strsplit(as.character(name1), "\\."))
	name0<-array(name0,dim=c(2,length(name0)/2))[1,]
	name0[67]<-substr(name0[67],1,5) #edits wy005b to wy005
 	
   #IDENTIFY CENSORED SERIES IN MBH98 BACKTO_1400-CENSORED CALCULATION
	url<-"http://www.climateaudit.org/data/mbh98/UVA/TREE/ITRDB/NOAMER"
	roster<-read.table(file.path(url,"noamer-censored.inf"))#192 
	roster2<-read.table(file.path(url,"noamer.inf"))#192 series
	censored.list<-roster2[,2][is.na(match(roster2[,2],roster[,2]))]
	censored.list<-as.character(censored.list)
	length(censored.list) #20
	temp1<-!is.na(match(name1,censored.list)) #20 items


#################
#SOME NOAMER NETWORK STATISTICS 
#################
          ##NUMBER OF SITES BY TREE TYPE AND GRIDCELL
	download.file("http://www.climateaudit.org/data/tree/northamerica.details.tab","temp.dat",mode="wb");load("temp.dat")  
	dim(details) #3457 9
	temp<-!is.na(match(details$id,name0));sum(temp) #70
	test<-jones(details$lat[temp],details$long[temp])
	tapply(test,factor(test),length)
	#590 660 661 662 663 664 732 733 734 735 736 738 741 807 810 812 813 
	#  1   5   4   6   4   1   1  13   9  12   2   2   1   2   3   3   1 

	tapply(details$type[temp],factor(details$type[temp]),length)
	#JUOC JUSC PCEN PIAR PIBA PIED PIFL PIJE PILO PIMO PIPO PISF PSME TADI 
	#   6    1    2    5    6    2    7    1   13    3    3    1   10   10 

	
#################################
##  REPLY TO HUYBERS FIGURE 1: DIFFERENT PC ANALYSES WITH VARIOUS SCALING METHODS
####################################

	pc1=ts(array(NA,dim=c(581,9)),start=1400)
	dimnames(pc1)[[2]]=c("MBH","cov","cor","hac","censored","mean","mean1400","nobristle","area")

 #1. MANNOMATIC
	pc1[,"MBH"]=proxy.MBH[,16]

 #5. CENSORED 
 	censored.PC1<-read.table("http://www.climateaudit.org/data/mbh98/UVA/TREE/ITRDB/NOAMER/BACKTO_1400-CENSORED/pc01.out")
	pc1[,"censored"]= scale(censored.PC1[,2])

 #2. Covariance
	pc1[,"cov"]=scale(svd( scale(tree,center=TRUE,scale=FALSE))$u[,1])

 #3. Correlation
	pc1[,"cor"]=scale(svd( scale(tree))$u[,1])

 #4. HAC (Heteroskedasticity-Autocorrelation) CALCULATIONS
	library(sandwich)
	hac.sd<-apply(tree,2,hac)
	pc1[,"hac"]= scale( svd ( scale(tree,scale=hac.sd) )$u[,1]) 

 #6.  AD1400 Network Mean
	pc1[,"mean1400"]= scale(apply(tree,1,mean))

  #7. Full Network MEan
	pc1[,"mean"]= scale(apply(ts(scale(tree0),start=1400),1,mean,na.rm=TRUE)   )
 
  #8. no bristle 
	pc1[,"nobristle"]=scale(svd( scale(tree[,!temp1]))$u[,1])

 #AREA WEIGHTED
	test<-jones(details$lat[temp],details$long[temp])
	fred<-details[temp,c(1,3,4,5)]
	fred$jones<-factor(test)
	fred$count<-fred$jones
	levels(fred$count)<-c(tapply(test,factor(test),length))
	fred$weight<-1/as.numeric(levels(fred$count))[fred$count]
	tree.areaweighted<-scale(scale(tree),scale=1/sqrt(fred$weight),center=FALSE)
	fred$speciescount<-factor(fred$type)
	levels(fred$speciescount)<-tapply(fred$speciescount,fred$speciescount,length)
	fred$speciescount<-as.numeric(levels(fred$speciescount))[fred$speciescount]
	pca.areaweighted<-prcomp(tree.areaweighted)
	  #ts.plot(scale(pca.areaweighted$x[,1]) )
		#not plotted, but similar to covariance


### NOW PLOT FIGURE
  # postscript(file.path("d:/climate/data/huybers","figure1.amended2.eps"), width = 3.3, height =4.2,
  #          horizontal = FALSE, onefile = TRUE, paper = "special",
  #          family = "Helvetica",bg="white",pointsize=8)

	nf <- layout(array(1:8,dim=c(4,2)),heights=c(1.2,1,1,1.4))

  #MBH98 PC1
	par(mar=c(0,4,2,0)) #MBH98
	plot(1400:1980,pc1[,1],type="h",ylab="",axes=FALSE,xlab="")#,ylim=c(-6.1,4))
	abline(h=0,lty=2)
	axis(side=2,las=2,cex.axis=1.25,font=2)#,at=seq(-3,1),labels=c("","-2","","0",""))
	axis(side=1,labels=FALSE)
	box()
	text(1400,2,cex=1.4,"a",pos=4,font=2)

   ##covariance PC1
	par(mar=c(0,4,0,0))
	plot(1400:1980,pc1[,"cov"],axes=FALSE,type="h",ylab="",xlab="",font.lab=2,cex.lab=1.25,ylim=c(-3.0,3.0))
	axis(side=2,at=seq(-6,2,2),las=2,cex.axis=1.25,font=2)
	abline(h=0,lty=2)
	axis(side=1,labels=FALSE)
	box()
	text(1400,2.5,cex=1.4,"b",pos=4,font=2)

  ##correlation PC1
	par(mar=c(0,4,0,0))
	plot( 1400:1980,pc1[,"cor"],axes=FALSE,type="h",ylab="",xlab="",font.lab=2,cex.lab=1.25,ylim=c(-3.0,3.0))
	axis(side=2,at=seq(-6,2,2),las=2,cex.axis=1.25,font=2)
	abline(h=0,lty=2)
	axis(side=1,labels=FALSE)
	box()
	text(1400,2.5,cex=1.4,"c",pos=4,font=2)

  ##hac PC1
	par(mar=c(4,4,0,0))
	plot( 1400:1980,pc1[,"hac"],axes=FALSE,type="h",ylab="",xlab="",font.lab=2,cex.lab=1.25,ylim=c(-3.0,3.0))
	axis(side=2,at=seq(-6,2,2),las=2,cex.axis=1.25,font=2)
	abline(h=0,lty=2)
	axis(side=1,cex.axis=1.25,at=seq(1400,2000,100),labels=c("","1500","","1700","","1900",""),font=2)
	box()
	text(1400,2.5,cex=1.4,"d",pos=4,font=2)

  ##CENSORED PC1	
 	par(mar=c(0,0,2,3))
	plot( 1400:1980,pc1[,"censored"],axes=FALSE,type="h",ylab="",xlab="",font.lab=2,cex.lab=1.25,ylim=c(-3.0,3.0))
	axis(side=4,at=seq(-6,2,2),las=2,cex.axis=1.25,font=2)
	abline(h=0,lty=2)
	axis(side=1,labels=FALSE)
	box()
	text(1400,2.5,cex=1.4,"e",pos=4,font=2)

  ##network mean	
 	par(mar=c(0,0,0,3))
	plot( 1400:1980,pc1[,"mean"],axes=FALSE,type="h",ylab="SD Units",xlab="",font.lab=2,cex.lab=1.25,ylim=c(-3.0,3.0))
	axis(side=4,at=seq(-6,2,2),las=2,cex.axis=1.25,font=2)
	abline(h=0,lty=2)
	axis(side=1,labels=FALSE)
	box()
	text(1400,2.5,cex=1.4,"f",pos=4,font=2)

  ##AD1400 subset mean	
 	par(mar=c(0,0,0,3))
	plot( 1400:1980,pc1[,"mean1400"],axes=FALSE,type="h",ylab="SD Units",xlab="",font.lab=2,cex.lab=1.25,ylim=c(-3.0,3.0))
	axis(side=4,at=seq(-6,2,2),las=2,cex.axis=1.25,font=2)
	abline(h=0,lty=2)
	axis(side=1,labels=FALSE)
	box()
	text(1400,2.5,cex=1.4,"g",pos=4,font=2)

  ##no bristlecone
 	par(mar=c(4,0,0,3))
	plot(1400:1980,pc1[,"nobristle"],axes=FALSE,type="h",ylab="SD Units",xlab="",font.lab=2,cex.lab=1.25,ylim=c(-3.0,3.0))
	axis(side=4,at=seq(-6,2,2),las=2,cex.axis=1.25,font=2)
	abline(h=0,lty=2)
	axis(side=1,cex.axis=1.25,at=seq(1400,2000,100),labels=c("","1500","","1700","","1900",""),font=2)
	box()
	text(1400,2.5,cex=1.4,"h",pos=4,font=2)
   #  dev.off()


######################################################
###
##		PART 2
###	
###            BENCHMARKING THE REDUCTION OF ERROR STATISTIC FOR THE MBH98 ALGORITHM
####%%%%#########################################################


  ##LOAD SIMULATED PC1s from MM2005
	download.file("http://www.climateaudit.org/data/mm05/sim.1.tab","temp.dat",mode="wb")
	load("temp.dat") #this is object named Eigen0 that as third item has 1000 simulated PC1s
		#this is a roster of simulated PC1s #Eigen0
	dim(Eigen0[[3]])  #581 1000
 
  ##NOMERNCLATURE OF MBH98 DATA FOR SIMULATIONS
	selection=1  #only one PC retained in AD1400 network
	U<-as.matrix(pc[1:79,selection])
	Ut<-t(U)
 	G<- t( rep(NA,22)) 
	P<-diag (rep(1,22))  
	sd.obs<-sd(pc[1:79,1])
	sd.rescaling=gridcell.stdev.ordermann	
	beta<-   lambda[1]  * t(eof) [1,mask.logical] %*% (gridcell.stdev.ordermann[mask.logical]*sd.rescaling[mask.logical])/sum(mu[mask.logical])		
	beta#  5.207708	

  ##PLACEMARK FOR SIMULATION
	K<-1000 ## used in actual simulation K=100 for brevity here
	stat<-data.frame(array(rep(NA),dim=c(K,10)))
	names(stat)[1:7]<-c( "RE.cal","RE.ver","R2.cal","R2.ver","CE","sign.test","prod.test")
		#postscript: much of the linear algebra cancels and I would do this much more simply now;
		#this code reconciles to Wahl and Ammann code apples and apples - see climateaudit.org May 2005

	  for (i in 1:K) {
		proxy<-ts(array(rnorm(22*581),dim=c(581,22)),start=1400,end=1980)
			#network of 22 series of white noise of length 581
		proxy[,1]<-Eigen0[[3]][,i]  #simulated HS series from MM2005
		for (j in 1:22) { proxy[,j]<-normalize(proxy[,j],1902,1980)}
		B<-proxy[503:581,]
		G<- as.matrix(  solve(Ut %*% U ) %*% Ut %*% B );  #this is array of regression coefficients in calibration
		C <- P %*%  ( t(G %*% P) %*% solve ((G %*% P) %*% t((G %*%P)))   )
	#calculate reconstructed temperature principal components #this is somewhat redundant
		H<- proxy %*% C 
		sd.est<-sd(H[503:581]) #adjusted May 2005
		H<-H*sd.obs/sd.est
		   #V<- (H * lambda[1])   %*% t(eof) [1,]    #dimension 112x1082 #allows for vector form
		   #sparse.fitted<- V[,mask.logical] %*% (gridcell.stdev.ordermann[mask.logical]*sd.rescaling[mask.logical])/sum(mu[mask.logical])		
		   #sparse.fitted<-ts(sparse.fitted,start=1400,end=1980)
		sparse.fitted<-ts(H*c(beta),start=1400,end=1980) #replces above 3 lines as algebra cancels
		stat[i,1:7]<-verification.stats(sparse.fitted,T,calibration,verification)
		}
	
   ##COLLECT QUANTILE RESULTS
	f=function(x) quantile(x,c(.99,.95,.9,.5))
	sim.results=apply(stat[,1:7],2,f)
	round(sim.results,3)

      ##RE-RUN RESULTS
#    RE.cal RE.ver R2.cal R2.ver     CE sign.test prod.test
#99%  0.309  0.541  0.316  0.132 -0.077        32     2.296
#95%  0.239  0.470  0.239  0.085 -0.242        29     1.682
#90%  0.200  0.429  0.201  0.055 -0.340        28     1.333
#50%  0.060 -0.069  0.092  0.010 -1.506        23    -0.064
	
    ##ORIGINAL RESULTS
	#  RE.cal RE.ver R2.cal R2.ver CE      sign.test prod.test
	#99%  NA  0.542  NA    0.115 -0.096        0.667     2.469
	#95%  NA  0.454  NA    0.073 -0.280        0.625     1.698
	#90%  NA  0.397  NA    0.057 -0.413        0.583     1.361
	#50%  NA -0.074  NA    0.010 -1.517        0.479     0.029

################################## 	
###ADDITIONAL CALCULATIONS AT CLIMATEAUDIT
################################

#A. CALCULATE RE AND OTHER STATS WITH TECH STOCK PC1 PLUS WHITE NOISE

   #obtain PC1 from Tech stock prices 
	price<-read.table("http://www.climateaudit.org/data/mm05/technology.stock.prices.txt",skip=1,header=TRUE,fill=TRUE,sep="\t")
	dim(price)# [1] 2259  159
	id<-scan("d:/climate/data/ritson/technology.stock.prices.txt",n=20,what="")
	name0<-c("date","open","high","low","close","volume","adj.close","X")
	test <-c(t(  outer(id,name0,  function(x,y) paste(x,y,sep=".") )   ))
	names(price)<-test[1:(length(test)-1)]
	price<-price[nrow(price):1,c(1, seq(5,8*19+5,8))]
	test<-price[1410:2000,2:ncol(price)]
	test<-ts(test[1:581,!is.na(test[100,])],start=1400)
	pca.mannomatic.tech<-prcomp(mannomatic(test) ,center=FALSE) 

   #do MBH-style recons making up balance nework with white noise 
	K<-100
	stat<-data.frame(array(rep(NA),dim=c(K,7)))
	names(stat)[1:7]<-c( "RE.cal","RE.ver","R2.cal","R2.ver","CE","sign.test","prod.test")

 	for (i in 1:K) {
		proxy<-ts(array(rnorm(22*581),dim=c(581,22)),start=1400,end=1980)
		proxy[,1]<-pca.mannomatic.tech$x[,1]

		for (j in 1:22) { proxy[,j]<-normalize(proxy[,j],1902,1980)}
		 B<-proxy[503:581,]
		 G<- as.matrix(  solve(Ut %*% U ) %*% Ut %*% B );  #this is array of regression coefficients in calibration
		 C <- P %*%  ( t(G %*% P) %*% solve ((G %*% P) %*% t((G %*%P)))   )
		 H<- proxy %*% C 
		 sd.est<-sd(H[503:581]) #adjusted May 2005
		 H<-H*sd.obs/sd.est
		 sparse.fitted<-ts(H*c(beta),start=1400,end=1980) #replces above 3 lines as algebra cancels
		 stat[i,]<-verification.stats(sparse.fitted,T,c(1902,1980),c(1854,1901))
		}

	f=function(x) quantile(x,c(.99,.95,.9,.5))
	sim.results.tech=apply(stat,2,f)
 	round(sim.results.tech,3)

#RE-RUN
#    RE.cal RE.ver R2.cal R2.ver     CE sign.test prod.test
#99%  0.300  0.509  0.305  0.117 -0.151     31.03     2.449
#95%  0.264  0.460  0.266  0.094 -0.267     29.00     1.981
#90%  0.257  0.419  0.258  0.067 -0.361     29.00     1.636
#50%  0.186  0.328  0.188  0.015 -0.575     24.00    -0.089

##############  
#B. DO WITH MBH PC1/GASPE BLEND AND BALANCE WHITE NOISE
	x=apply(proxy.MBH[,c(9,16)],1,mean)
	K<-100
	stat<-data.frame(array(rep(NA),dim=c(K,7)))
	names(stat)[1:7]<-c( "RE.cal","RE.ver","R2.cal","R2.ver","CE","sign.test","prod.test")

 	for (i in 1:K) {
		proxy<-ts(array(rnorm(22*581),dim=c(581,22)),start=1400,end=1980)
		proxy[,1]<-x
		for (j in 1:22) { proxy[,j]<-normalize(proxy[,j],1902,1980)}
		 B<-proxy[503:581,]
		 G<- as.matrix(  solve(Ut %*% U ) %*% Ut %*% B );  #this is array of regression coefficients in calibration
		 C <- P %*%  ( t(G %*% P) %*% solve ((G %*% P) %*% t((G %*%P)))   )
		 H<- proxy %*% C 
		 sd.est<-sd(H[503:581]) #adjusted May 2005
		 H<-H*sd.obs/sd.est
		 sparse.fitted<-ts(H*c(beta),start=1400,end=1980) #replces above 3 lines as algebra cancels
		 stat[i,]<-verification.stats(sparse.fitted,T,c(1902,1980),c(1854,1901))
		}

	sim.results.wn=apply(stat,2, function(x) quantile(x,c(.99,.95,.9,.5)))
	round(sim.results.wn,3)

#RERUN
#    RE.cal RE.ver R2.cal R2.ver     CE sign.test prod.test
#99%  0.213  0.586  0.213  0.117  0.030     30.01     2.698
#95%  0.200  0.575  0.201  0.093  0.005     28.00     2.341
#90%  0.188  0.559  0.190  0.081 -0.033     27.10     1.938
#50%  0.127  0.501  0.139  0.026 -0.171     23.00     1.147


##############  
#B. DO WITH BALANCE PROXIES ONLY

 	proxy<-proxy.MBH[,c(1:8,10:15,17:22)]
	P=diag(rep(1,ncol(proxy)))
	for (j in 1:ncol(proxy)) { proxy[,j]<-normalize(proxy[,j],1902,1980)}
		 B<-proxy[503:581,]
		 G<- as.matrix(  solve(Ut %*% U ) %*% Ut %*% B );  #this is array of regression coefficients in calibration
		 C <- P %*%  ( t(G %*% P) %*% solve ((G %*% P) %*% t((G %*%P)))   )
		 H<- proxy %*% C 
		 sd.est<-sd(H[503:581]) #adjusted May 2005
		 H<-H*sd.obs/sd.est
		 sparse.fitted<-ts(H*c(beta),start=1400,end=1980) #replces above 3 lines as algebra cancels

	verification.stats(sparse.fitted,T,c(1902,1980),c(1854,1901))
#     RE.cal     RE.ver    R2.cal       R2.ver        CE sign.test prod.test
#1 0.1221319 -0.1300386 0.1347229 5.612416e-05 -1.648948        26  1.165787