##FUNCTIONS USED IN HANSEN STEP 2 and DIAGNOSTICS
    #saved as www.climateaudit.org/scripts/station/giss/step2.functions.txt

##two-legged adjustment HAnsen style
  #also returns one-legged adjustment and HAnsen flags 
twoleg=function(x) {
	lshort=7;slplim=.1;slpx=.05;slpt=.02
	N=length(x)
	stat=rep(NA,N)
	temp=!is.na(x)
	index=(1:N)[temp]
	fm0=lm(x[temp]~index)
	for (i in  index[5:(length(index)-4)]) {
	  z=data.frame(x, abs( (1:N)-i), (1:N)-i)   ;names(z)=c("y","abs","x")	
	  fm=lm(y~abs+x,data=z)
	  stat[i]=ssq(fm$residuals)	
	  }
	index0=(1:N)[(stat==min(stat,na.rm=TRUE))&!is.na(stat)]
     z=data.frame(x, abs( (1:N)-index0), (1:N)-i)   ;names(z)=c("y","abs","x")	
     fm=lm(y~abs+x,data=z)
     fm0=lm(y~x,data=z)	
	sl1=coef(fm)[3]-coef(fm)[2];sl2=coef(fm)[3]+coef(fm)[2]
	flag=rep(NA,4)
     	flag[1]= 1*((index0<lshort)| ((N-index0)<lshort))	
     	flag[2]=10*( abs(sl1)>slplim)+20*(abs(sl2)>slplim) 	
	flag[3]=100*(abs(sl2-sl1)>slplim)+100*(abs(sl2-sl1)>slpx)
	flag[4]= 1000*( (sl1*sl2<0) & (abs(sl1)>slpt) &(abs(sl2)>slpt) ) 
	iflag=sum(flag)
   	#if(!(iflag==0) & !(iflag==100) ) fm$coefficients[2:3]=fm0$coefficients[2]
     	twoleg=list(fm,index0+tsp(x)[1]-1,iflag,fm0);names(twoleg)=c("model","xmid","flag","fm0")
	twoleg
	}

#Make an annual anomaly Hansen style
	anom=function(x,method="anom") {
		if( (class(x)=="ts")|(class(x)=="mts")) K=floor(tsp(x)[1])
		if( is.null(dim(x))) {test=t(array(x,dim=c(12,length(x)/12)) )
		if(method=="anom") apply(test[(1961:1990)-K+1,],2,mean,na.rm=TRUE)
		if(method=="hansen") m0=round(apply(test,2,mean,na.rm=TRUE),2)
		test=scale(test,center=m0,scale=FALSE)
		anom=ts(c(t(test)),start=c(K,1),freq=12)
		anom} else {anom=x;
		for(i in 1:ncol(x)) {test=t(array(x[,i],dim=c(12,nrow(x)/12)) )
		if(method=="anom") m0=apply(test[(1961:1990)-K+1,],2,mean,na.rm=TRUE)
		if(method=="hansen") m0=round(apply(test,2,mean,na.rm=TRUE),2)
		test=round(scale(test,center=m0,scale=FALSE),2)
		anom[,i]=c(t(test))}}
		anom=list(anom,m0)
		anom
		}
			
### circledist
	#calculates great circle distance  with earth radius 6378.137 km
	#from http://en.wikipedia.org/wiki/Great-circle_distance 2nd formula	
   circledist =function(loc, lat,long,R=6372.795) {
	N=length(lat)	
	if(N==0) circledist=NA else {
	  pi180=pi/180;  
   	  x= abs(long -loc[2])
	  if (N>1) delta= apply( cbind(x %%360,(360-x)%%360),1,min)  *pi180  else delta= min (x %%360,(360-x)%%360) *pi180  
	  loc=loc*pi180; lat=lat*pi180; long=long*pi180
	  theta= 2* asin( sqrt(  sin( (lat- loc[1])/2 )^2 + cos(lat)*cos(loc[1])* (sin(delta/2))^2       ))
	  circledist=R*theta
	  }
	circledist
     }		


###Make rural network for given id 	
   ruralstations=function(id0) {
	  if(is.numeric(id0)) i=id0 else i= (1:7364)[!is.na(match (names(giss.dset1),id0))] 
	   temp_rur=(station_versions$urban=="R")
	   station_versions$dist=NA
	   lat0=stations$lat[i];long0=stations$long[i]
	   temp1= (abs (station_versions$lat-lat0) <10) # & !is.na(station_versions$start_raw)
		#initial screening only by latitude: must be within 10 deg latitude to be within 1000 km
	   test_angle= 10/sin(abs(lat0)*pi180)
	   x=	abs(station_versions$long -long0)
	   delta= apply( cbind(x %%360,(360-x)%%360),1,min) 
  	   temp2=(delta<test_angle) 
		#second screen: within 1000 km by longitude
	   ruralstations=station_versions[temp1&temp2&temp_rur,c("station","version","name","long","lat","alt","dist","urban","lights")]
		#matrix of stations meeting first two screen tests
   	   ruralstations$dist=circledist(loc=c(lat0,long0),lat=ruralstations$lat,long=ruralstations$long)
		#calculate circle distance for all stations in first screen
	   temp=(ruralstations$dist<=1000)&!is.na(ruralstations$dist)
	   if(sum(temp)==0) ruralstations= NA else {
	     ruralstations=ruralstations[temp,]  #restrict to stations within 1000 km
	     temp=(ruralstations$dist<=500)&!is.na(ruralstations$dist)
	     if(sum(temp)>=3) ruralstations=ruralstations[temp,]
   	     ruralstations=ruralstations[order(ruralstations[,"dist"]),]#order by increasing distance
	     ruralstations$weights= 1- ruralstations$dist/1000}
	 ruralstations
	 }

##HANSEN REFERENCE METHOD
  #do HAnsen reference on network of time series
     
hansenref=function(rural) {
	K=nrow(rural)
    #collate rural anom
	anom.rural=NULL
	for (k in 1:K) anom.rural=ts.union(anom.rural,giss.anom[[ paste(rural$version[k]) ]])
	dimnames(anom.rural)[[2]]=rural$version
       		#cross check to binary should be OK

	N=nrow(anom.rural)
	W=t(array(rep(rural$weights,N),dim=c(length(rural$weights),N) ) )
	W[is.na(anom.rural)]=NA
		#matrix of weights: at least 3 stations and station available
	long0=apply(!is.na(anom.rural),2,sum)
		#calculates length of the candidate rural series
      	order1=order(long0,decreasing=TRUE);index=NULL
		#long0[order1] # orders by decreasing length
	delta=NULL
   	fixed=NULL; available= 1:K; 

  #start with the longest series
   if (max(long0)<20) reference=NA else {
   	j0=order1[1]; 
 	reference=anom.rural[,j0] #picks longest series to insert
	reference.seq=reference
	weights1=W[,j0]
	fixed=j0

     #sequentially adds in the rural series 
	for (k in 2:K) {
	  j0=order1[k] #chooses the series to be added
	  Y=cbind(anom.rural[,j0],reference,W[,j0],weights1,reference); 
	  temp=!is.na(Y[,1])&!is.na(Y[,2])
	  #if(sum(temp)>=20) {
	  bias= mean(Y[temp,1])-mean(Y[temp,2])
		#in this case, there is a long separation: what happens then 
	  Y[,1]=Y[,1]-bias  #these have same centers
	  g=function(x) weighted.mean(x[1:2],x[3:4],na.rm=T) 
	  Y[,5]=apply(Y[,1:4],1,g)
	  fixed=c(fixed,j0)
	  reference=Y[,5]
	  delta=c(delta,bias)
	  reference.seq=cbind(reference.seq,reference)
	  weights1=apply(Y[,3:4],1,sum,na.rm=T)
	 }
        } 
	count=ts(apply(!is.na(anom.rural),1,sum),start=tsp(anom.rural)[1])
	hansenref=list(reference,count,anom.rural,W);
	names(hansenref)=c("reference","count","anom","weights")
	hansenref
	}


##FUNCTION TO EMULATE STEP 2 COMBINING   #requires ... 
emulate_dset2=function(id0) {

  #collate urban target data
	if (is.numeric(id0) &(id0<8000)) i=id0 else  i=(1:7364)[(names(giss.dset1)==id0)]
	dset1.monthly=ts( c(t(giss.dset1[[i]][[1]][,2:13])) ,start=c(giss.dset1[[i]][[1]][1,1],1),freq=12)
	dset1.annual= ts(giss.dset1[[i]][[1]][,18], start=giss.dset1[[i]][[1]][1,1])
	dset1.monthly.anom=anom(dset1.monthly,method="hansen")
	dset1.norm=dset1.monthly.anom[[2]];dset1.monthly.anom=dset1.monthly.anom[[1]]
	dset1.annual.anom=hansenq(dset1.monthly.anom,method="ts")[,5]
	dset1.count= ts( apply(!is.na(giss.dset1[[i]][[1]][,14:17] ),1,sum,na.rm=TRUE),start=giss.dset1[[i]][[1]][1,1])
	dset2.monthly=ts( c(t(giss.dset2[[i]][[1]][,2:13])) ,start=c(giss.dset2[[i]][[1]][1,1],1),freq=12)
	dset2.annual= ts(giss.dset2[[i]][[1]][,18], start=giss.dset2[[i]][[1]][1,1])
	dset1.info=stations[i,]

  #collate rural station comparanda
	rural=ruralstations(id0)
	reference0=hansenref(rural)
	count=reference0$count
	reference=reference0$reference

    #reference
	Y=ts.union(dset1.annual,dset1.annual.anom,dset2.annual,dset2.annual,reference,reference,reference,reference,dset1.count,count,reference)
	Y=cbind(Y,c(time(Y)))
	dimnames(Y)[[2]]=c("dset1.book","dset1.anom","dset2.book","delta","reference","adjusted","adjustment","emulation","urban.count","dset1.count","pre_adj","year")
	M2=range(c(time(Y))[!is.na(Y[,"dset2.book"])],na.rm=T)
	Y[,c("delta","adjustment","adjusted","emulation","pre_adj")]=NA
	Y[,"delta"]=Y[,"dset2.book"]-Y[,"dset1.book"]
	#temp0=(Y[,"dset1.count"]>=3)&!is.na(Y[,"dset1.count"])&!is.na(Y[,"dset1.anom"])
	temp1=(Y[,"dset1.count"]>=3)&!is.na(Y[,"dset1.count"])
	temp2=!is.na(Y[,"dset1.anom"])
	M0=range(time(Y)[temp1&temp2])  #range of available
	temp0=(time(Y)>=M0[1])&(time(Y)<=M0[2])
			#seems to ignore fallback below 3
	N3=sum( temp0&temp2);  #number of years with both dset1 has values and count>=3 in range with count>=3
	N3ext=floor(N3*1.5)
	temp4=(Y[,"dset1.count"]>0)&!is.na(Y[,"dset1.count"])&(Y[,"urban.count"]>0)&!is.na(Y[,"urban.count"])

	M1=c(NA,max(c(time(Y))[temp4]) ) #1988 
	M1[1]=M1[2]-N3ext+1;M1

	y=Y[,"reference"]-Y[,"dset1.anom"]
	adj=twoleg(ts(y[temp0],start=M0[1]))
	if ((adj$flag==0)|(adj$flag==100)) x=fitted(adj$model) else x= fitted(adj$fm0) 
		#round(x,1)
	Y[temp0&temp2,"pre_adj"]=round(x,1)
	x=x-x[length(x)]	
	Y[temp0&temp2,"adjustment"]=round(x,1)
	temp1A=(time(Y)>=M1[1])&!is.na(Y[,"dset1.anom"])&(time(Y)<M0[1])
	temp1B=(time(Y)<=M1[2])&!is.na(Y[,"dset1.anom"])&(time(Y)>M0[2])
	Y[temp1A,"adjustment"]=round(x[1],1)
	Y[temp1B,"adjustment"]=round(x[length(x)],1)
	Y[,"adjusted"]=round(Y[,"dset1.anom"]+Y[,"adjustment"],2)

      ##now rebuild monthly series
	Z=ts.union(dset1.monthly,dset1.monthly.anom,dset2.monthly,dset2.monthly,dset2.monthly,dset2.monthly,dset2.monthly)
	dimnames(Z)[[2]]=c("dset1.book","dset1.anom","dset2.book","adjustment","adjusted","emulation","delta")
 	Z[,4:7]=NA
	temp2=(time(Y)>=tsp(Z)[1])&(time(Y)<=tsp(Z)[2])
	x= c(t(array(rep(Y[temp2,"adjustment"],12),dim=c(nrow(Y[temp2,]),12)) ))
	Z[,"adjustment"]=c(x[2:length(x)],NA)
	Z[,"adjusted"]=round(Z[,"adjustment"]+Z[,"dset1.anom"],2)
	X= t(array(Z[,"adjusted"],dim=c(12,nrow(Z)/12) )) 
	Z[,"emulation"]= c(t(scale(X,center= -dset1.norm,scale=FALSE) ))
	Z[,"delta"]=round(Z[,"dset2.book"]-Z[,"dset1.book"],2)
	#hansenq(Z[,"emulation"],method="ts")

	emulate_dset2=list(Y,Z,reference0,rural,M0,M1,M2,adj,dset1.info)
	names(emulate_dset2)=c("annual","monthly","reference","rural","M0","M1","M2","adj","info") 
	emulate_dset2
       }


  extract.log=function(id0){
	index=grep(substr(id0,4,11),paparsghcn.log);
	test=paparsghcn.log[index[1]:min((max(index)+300),length(paparsghcn.log))]
	K=min(grep("possible range",test))
	index=grep(substr(id0,4,11),test);
	test1=test[max(index):K]; 
	a1= c( substr(test1[2],36,45),  substr(test1[seq(3,length(test1)-3,2)],37,46) )
	n1= as.numeric(c( substr(test1[2],32,34), substr(test1[seq(3,length(test1)-3,2)],33,35)))
	test1=data.frame(a1,n1);names(test1)=c("id","length")

	test2= paparslist[c(1,grep(substr(id0,4,11),paparslist))]
	test2[2]=gsub("-"," ",test2[2])
	writeLines(test2,"temp.dat")
	test2=read.table("temp.dat",colClasses=c("character",rep("numeric",13)),skip=1)
	name0=c(scan("temp.dat",n=9,what=""),"M0_start","M0_end","M1_start","M1_end","flag")
	name0[2:3]=c("sl1","sl2")
	names(test2)=name0
	extract.log=list(test1,test2)
	extract.log
     }

##Prior version of HAnsen reference, kept for cross check
hansenref_old=function(rural) {
	K=length(rural$station)
   #collate Centigrade data
	chron=NULL; monthly=NULL
	for (k in 1:K){
	  n=(1:7364)[names(giss.dset1)==rural$station[k]]
	  #test1=read.giss(paste(rural$id[k]),dset=1)[[1]]
	  test1=giss.dset1[[paste(rural$station[k])]][[1]]
	  chron=ts.union(chron, round(ts( test1[,"ANN"],start=test1[1,1]), 2) )
	  monthly=ts.union(monthly, ts( c(t(giss.dset1[[n]][[1]][,2:13])) ,start=c(giss.dset1[[n]][[1]][1,1],1),freq=12))
	  }
	tsp0=tsp(chron)#1881 2008
	dimnames(chron)[[2]]=rural$station
	dimnames(monthly)[[2]]=rural$station
	          #this collects annual network of rural stations using Hansen interpolation

    #collate anomaly version Hansen-annualized
	monthly.anom=monthly; 
	for (i in 1:K) monthly.anom[,i]=anom(monthly[,i],method="hansen")[[1]]
	qtr.anom=rep(list(NA),K)
	for(i in 1:K) qtr.anom[[i]]=hansenq(monthly.anom[,i],method="ts")
	anom.rural=NULL
	for (i in 1:K) anom.rural=ts.union(anom.rural,qtr.anom[[i]][,5])
	dimnames(anom.rural)[[2]]=rural$station
       #cross check to binary should be OK

	N=nrow(anom.rural)
	W=t(array(rep(rural$weights,N),dim=c(length(rural$weights),N) ) )
	W[is.na(anom.rural)]=NA
		#matrix of weights: at least 3 stations and station available
	long0=apply(!is.na(anom.rural),2,sum)
		#calculates length of the candidate rural series
      	order1=order(long0,decreasing=TRUE);index=NULL
		#long0[order1] # orders by decreasing length
	delta=NULL
   	fixed=NULL; available= 1:K; 

   #start with the longest series
      if (max(long0)<20) reference=NA else {
   	j0=order1[1]; 
 	reference=anom.rural[,j0] #picks longest series to insert
	reference.seq=reference
	weights1=W[,j0]
	fixed=j0

     #sequentially adds in the rural series 
	for (k in 2:K) {
	  j0=order1[k] #chooses the series to be added
	  Y=cbind(anom.rural[,j0],reference,W[,j0],weights1,reference); 
	  temp=!is.na(Y[,1])&!is.na(Y[,2])
	  #if(sum(temp)>=20) {
	  bias= mean(Y[temp,1])-mean(Y[temp,2])
		#in this case, there is a long separation: what happens then 
	  Y[,1]=Y[,1]-bias  #these have same centers
	  g=function(x) weighted.mean(x[1:2],x[3:4],na.rm=T) 
	  Y[,5]=apply(Y[,1:4],1,g)
	  fixed=c(fixed,j0)
	  reference=Y[,5]
	  delta=c(delta,bias)
	  reference.seq=cbind(reference.seq,reference)
	  weights1=apply(Y[,3:4],1,sum,na.rm=T)
	 }
        } 
	count=ts(apply(!is.na(anom.rural),1,sum),start=tsp0[1])
	hansenref_old=list(reference,count,anom.rural,W);
	names(hansenref_old)=c("reference","count","anom","weights")
	hansenref_old
     }

#MISC FUNCTIONS

trim=function (chron) {  
	M0=range(time(chron)[!is.na(chron)])
	trim=ts(chron[ (time(chron)>=M0[1])&(time(chron)<=M0[2]) ],start=M0[1])
	trim}

long=function(x){ x=trim(x); M0=range(time(x));long=M0[2]-M0[1]+1;long}

hansen_delta=function(x,reference){
	combine=ts.union(x,reference)
	temp=!is.na(combine[,1])&!is.na(combine[,2])
	m0=apply(combine[temp,],2,mean)
	hansen_delta=m0[1]-m0[2]
	hansen_delta}

ssq=function(x){ssq=sum(x^2,na.rm=TRUE);ssq}


##FUNCTIONS USED FOR DIAGNOSTICS

#this does a simple plot to see if the emulated adjustment matches the book adjustment
   plot1=function(W) {
	plot.ts(W$monthly[,"delta"]-W$monthly[,"adjustment"],ylim=c(-.4,.4),main=paste(W$info$name," dset2 (monthly delta)"))
	}

 #this is a more complicated diagnostic plot (illustrated in CA posts) showing various adjustment versions
   plot2=function(W,log0,sgn0=1) {
	year=c(time(W$annual))
	layout(1);par(mar=c(3,3,2,1))
	plot(year,W$annual[,"reference"]-W$annual[,"dset1.anom"],type="l",ylim=c(-1,1))
	lines(year,W$annual[,"delta"],lty=2,lwd=2)
	lines(year,W$annual[,"adjustment"],lty=3,col=2)
	lines(year,W$annual[,"pre_adj"],lty=2,col=2)
	temp0=(time(W$annual)>=W$M0[1])&(time(W$annual)<=W$M0[2])
	temp2=!is.na(W$annual[,"dset1.anom"])
	lines(year[temp0&temp2],W$adj$model$fitted.values,col=3,lty=1)
	lines(year[temp0&temp2],W$adj$fm0$fitted.values,col=3,lty=1)

	a=log0[[2]];xmid=a$knee
	points(a$knee,sgn0*a$Yknee/10,pch=19,col=5)
	x=W$M0[1]:W$M0[2];
	temp=(x <= a$knee);sum(temp)
	points( x[temp], sgn0*0.1*(a$Yknee + a$sl1*(x[temp]-xmid) ) ,col=5,pch=19,cex=.1) 
	temp=(x >=a$knee);sum(temp)
	points( x[temp], sgn0*0.1*(a$Yknee - a$sl2*(x[temp]-xmid) ) ,col=5,pch=19,cex=.1) 
		#note that this is minus sl2 and plus sl1 
	title(paste(W$info$name," Adjustments"))
	legend(locator(1),fill=c(1,5,3,2),legend=c("Book","Log","Fitted","Adjustment"),cex=.7,box.lwd=0)
	}

  #this shows selected rural stations, comparing to Hansen logs
    ruralcompare=function(W,log0){
	ruralcompare=rep(list(NA),3)
	ruralcompare[[1]]=W$rural
      #compare roster in W$rural to HAnsen log
	a1= log0[[1]]$id
	a2=substr(W$rural$version,4,12)
	ruralcompare[[2]]=c( a2[is.na(match(a2,a1))],
		a1[is.na(match(a1,a2))] )
       #compare lengths 
	 ruralcompare[[3]]=cbind(W$M1,W$M2)
	 dimnames(ruralcompare[[3]])[[2]]=c("W","log0")
	ruralcompare
       }


#STEP 1 FUNCTIONS
	
###2.  hansenq

   ## from monthly data with missing values, this function calculates quarterly and annual averages the "Hansen" way
   ## monthly averages are calculated for all months and also quarterly averages ("normal"); over available dates not a reference period
   ##quarters are DJF, MAM, JJA, SON 
   ## if there are 2 or 3 months in a quarter, the quarterly average is the average anomaly in available months plus the quarterly normal  
   ##if there are 3 or 4 quarters, the 

  hansenq=function(X,method="hansen") {
	if(method=="hansen") year=X$year else year= floor(tsp(X)[1]):floor(tsp(X)[2])
	N=length(year);start0=min(year) #32 18
	if(method=="hansen") X=as.matrix(X[,2:13]) else X=t( array(X,dim=c(12,length(X)/12) ) )
	m0=apply(X,2,mean,na.rm=TRUE)  #calculate mean over all values: possibly different
	q0=  apply( array(m0[c(12,1:11)],dim=c(3,4)),2,mean) #calculate quarterly mean
	q0=c(q0,mean(m0) ) #append annual mean
   	Xanom=scale(X,center=m0,scale=FALSE)   #calculate anomaly version calendar year
	x=c(t(Xanom));x=c(NA,x[1:(length(x)-1)]);Xanom=t(array(x,dim=c(12,N) ) ) #re-shape to Dec-Nov
	Xanom=t(array(t(Xanom),dim=c(3,N*4)) )  #re-shape array in quarter-rows rather than annual-rows
	count=apply(!is.na(Xanom),1,sum) #calculate non-missing values by quarter
	Qanom= apply(Xanom,1,mean,na.rm=TRUE); Qanom[count<2]=NA #calculate anomaly by quarter; set to NA if less than 2 values
	Qanom= t( array(Qanom,dim=c(4,length(Qanom)/4) ) ) #reshape into array of dimension nyear x 4
	count=apply(!is.na(Qanom),1,sum) #calculate non-missing values as with quarters
	Qanom=cbind(Qanom,apply(Qanom,1,mean,na.rm=TRUE)  ); dimnames(Qanom)[[2]]=c("DJF","MAM","JJA","SON","ANN")
	Qanom[count<3,"ANN"]=NA  #calculate annual anomaly as average of available quarters; set to NA if fewer than 3 qtrs available
	Qanom=scale(Qanom,center=-q0,scale=FALSE)  #re-center based on "normals"
	hansenq=ts(Qanom ,start=start0)  #make an annual ts object
	hansenq
	}

	#test=hansenq(station[[i]]);test1=station[[i]][,14:18];data.frame(test,test1)
	#x=ts( c(t(station[[1]][,2:13])),start=c(station[[1]][1,1],1),freq=12);hansenq(x,method="ts")