##EMULATION OF HANSEN STEP 1
  #this is the combination of combining dset=0 versions to make a dset=1 version

##TEXT

#Two records are combined as shown in Figure 2, if they have a period of overlap. The mean
#difference or bias between the two records during their period of overlap (dT) is used to adjust one
#record before the two are averaged, leading to identification of this way for combining records as the
#“bias” method (HL87) or, alternatively, as the “reference station” method [Peterson et al., 1998b].

#The adjustment is useful even with records for nominally the same location, as indicated by the
#latitude and longitude, because they may differ in the height or surroundings of the thermometer, in
#their method of calculating daily mean temperature, or in other ways that influence monthly mean
#temperature. Although the two records to be combined are shown as being distinct in Figure 2, in the
#majority of cases the overlapping portions of the two records are identical, representing the same
#measurements that have made their way into more than one data set.

#A third record for the same location, if it exists, is then combined with the mean of the first
#two records in the same way, with all records present for a given year contributing equally to the
#mean temperature for that year (HL87). This process is continued until all stations with overlap at
#a given location are employed. 

#If there are additional stations without overlap, these are also
#combined, without adjustment, provided that the gap between records is no more than 10 years and
#the mean temperatures for the nearest five year periods of the two records differ by less than one
#standard deviation. Stations with larger gaps are treated as separate records.

##there are several functions described below:
  	#hansen_bias
 	#hansenq
	#recent
	#overlap_hansen
	#emulate_dset1

#following are some tests

##################
##FUNCTIONS
###################

###LOAD ANCILLARY FUNCTIONS
	source("http://www.climateaudit.org/scripts/station/hansen/read.giss.txt")
  		#this loads the read.giss function used for downloading dset=0 and dset=1 series

 #requires provenance
	url="http://www.climateaudit.org/data/giss"
	provenance=NULL;stat=rep(NA,3)
	id=c("mcdw","ushcn","sumofday")	;K=length(id)
	for (i in 1:K) {
	 loc=file.path(url,paste(id[i],"tbl",sep="."))
	 test=read.table(loc)
	 test=data.frame(test);names(test)=c("wmo","id","dup");test$source=id[i];
	 stat[i]=nrow(test)#1502
	 provenance=rbind(provenance,test)	
	 }
	# 1502 1221  371 = 3094 
	provenance$dup=paste(provenance$id,provenance$dup,sep="")


##1. hansen_bias

   #this function calculates the "Hansen bias" between two time series by comparing the annual averages
   #calculated by "Hansen summing" in an overlap period 
  
   #two formats are allowed for: default method="hansen" uses the table layout used in GISTEMP: year, 12 months, 4 quarters, annual
   #method = "ts" will compare two ANNUAL time series

  hansen_bias=function(X,Y,method="hansen") {
     if(method=="hansen"){
	temp1=!is.na(X$ANN); temp2=!is.na(Y$ANN)
	test=match(Y$year[temp2],X$year[temp1]);temp=!is.na(test)
	A=Y$ANN[temp2][temp];#A
	B= X$ANN[temp1][test[temp]]
	a=sum(A-B)
	K=length(A)
	 hansen_bias= a/K
      } 
      else {
 	temp1=!is.na(X); temp2=!is.na(Y)
	test=match(c(time(Y))[temp2],c(time(X))[temp1]);temp=!is.na(test)
	A=Y[temp2][temp];#A
	B= X[temp1][test[temp]]
	#a= floor(100*A -100*B); a=sum(a)/100
	a=sum(A-B)
	K=length(A)
    	hansen_bias= a/K
      }
      hansen_bias
    }

   #hansen_bias(station[[2]],station[[5]])


###2.  hansenq

   ## from monthly data with missing values, this function calculates quarterly and annual averages the "Hansen" way
   ## monthly averages are calculated for all months and also quarterly averages ("normal"); over available dates not a reference period
   ##quarters are DJF, MAM, JJA, SON 
   ## if there are 2 or 3 months in a quarter, the quarterly average is the average anomaly in available months plus the quarterly normal  
   ##if there are 3 or 4 quarters, the 

  hansenq=function(X,method="hansen") {
	if(method=="hansen") year=X$year else year= floor(tsp(X)[1]):floor(tsp(X)[2])
	N=length(year);start0=min(year) #32 18
	if(method=="hansen") X=as.matrix(X[,2:13]) else X=t( array(X,dim=c(12,length(X)/12) ) )
	m0=apply(X,2,mean,na.rm=TRUE)  #calculate mean over all values: possibly different
	q0=  apply( array(m0[c(12,1:11)],dim=c(3,4)),2,mean) #calculate quarterly mean
	q0=c(q0,mean(m0) ) #append annual mean
   	Xanom=scale(X,center=m0,scale=FALSE)   #calculate anomaly version calendar year
	x=c(t(Xanom));x=c(NA,x[1:(length(x)-1)]);Xanom=t(array(x,dim=c(12,N) ) ) #re-shape to Dec-Nov
	Xanom=t(array(t(Xanom),dim=c(3,N*4)) )  #re-shape array in quarter-rows rather than annual-rows
	count=apply(!is.na(Xanom),1,sum) #calculate non-missing values by quarter
	Qanom= apply(Xanom,1,mean,na.rm=TRUE); Qanom[count<2]=NA #calculate anomaly by quarter; set to NA if less than 2 values
	Qanom= t( array(Qanom,dim=c(4,length(Qanom)/4) ) ) #reshape into array of dimension nyear x 4
	count=apply(!is.na(Qanom),1,sum) #calculate non-missing values as with quarters
	Qanom=cbind(Qanom,apply(Qanom,1,mean,na.rm=TRUE)  ); dimnames(Qanom)[[2]]=c("DJF","MAM","JJA","SON","ANN")
	Qanom[count<3,"ANN"]=NA  #calculate annual anomaly as average of available quarters; set to NA if fewer than 3 qtrs available
	Qanom=scale(Qanom,center=-q0,scale=FALSE)  #re-center based on "normals"
	hansenq=ts(Qanom ,start=start0)  #make an annual ts object
	hansenq
	}

	#test=hansenq(station[[i]]);test1=station[[i]][,14:18];data.frame(test,test1)
	#x=ts( c(t(station[[1]][,2:13])),start=c(station[[1]][1,1],1),freq=12);hansenq(x,method="ts")

##3. recent 
   # a little function that calculates the most recent value in a time series; used in ordering 

    recent=function(x) { temp=!is.na(x);recent=max(time(x)[temp]); recent}   

##4. overlap_hansen 
   # a little function that calculates the length of overlap between two ANNUAL time series
   #used in ordering

    overlap_hansen=function(x,y){
	temp1=!is.na(x);temp2=!is.na(y);year1=c(time(x));year2=c(time(y))
	temp=!is.na(match(year1[temp1],year2[temp2]))
	overlap_hansen=sum(temp)
	overlap_hansen
    }

###5. 
trim=function (chron) {  
	M0=range(time(chron)[!is.na(chron)])
	trim=ts(chron[ (time(chron)>=M0[1])&(time(chron)<=M0[2]) ],start=M0[1])
	trim}
###6.

long=function(x){ x=trim(x); M0=range(time(x));long=M0[2]-M0[1]+1;long}

	
##7. emulate_dset1

   #this is a function that emulates Hansen's calculation of dset=1 from dset=0
   #the parameter id0 is the 11-digit GISS id for a station: I find it best to use characters 
   #the default setting will download the station data from GISS	
	#this is CCCWWWWWIII - 3 -digit country; 5-digit WMO; 3-digit local
   	#this will download current dset0 and dset1 from GISS
   #you can use available station versions as long as they are in Hansen-format as station0 (dset=0) and station1(dset=1) 
  
   # variety of objects returned for diagnostic purposes
   	# $station0 the input dset0 as monthly ts
	# $annual0  -input dset0 as annual
	# $station1  - input dset1 monthly as ts
	# $annual1  - input dset1 annual as ts
	# $adjusted  -Hansen adjusted monthly
	# $adjusted_annual - Hansen adjusted annual
	# $reference  - closing Hansen quarterly as ts
	# delta  - adjustments for each version
	#index  - order of selection
 

 emulate_dset1=function(id0,station0=NA,station1=NA,download0=TRUE,prov=provenance){
     #optional downloading
  	if (download0) {
  	  if (class(id0)=="numeric") {
		id0=as.character(id0);
		J=nchar(id0);
		if (J<11) id0=paste(rep("0",11-J),id0,sep="")
 	  }	
	  station0=read.giss(id0,dset=0)
	  station1=read.giss(id0,dset=1)[[1]]
	}  #this will dowload GISS dset=0 and GISS dset=1 versions

     #collation into time-series objects
	annual1=ts(station1[,18],start=station1[1,1])
	station1=ts( c(t(station1[,2:13])),start=c(station1[1,1],1),freq=12)
	K=length(station0) #calculate number of dset=0 versions
	X=NULL; 
	for (i in 1:K) {test= station0[[i]];
		 y=ts( c(t(test[,2:13])) ,start=c(test[1,1],1),freq=12);
		X=ts.union(X,y)}
	if(K==1) X= ts(as.matrix(X),start=c(test[1,1],1),freq=12);
	dimnames(X)[[2]]=names(station0)
		#collate dset=0 monthly data into a time-series object
	annual=NULL
	for (i in 1:K) {test= station0[[i]];
		 y=ts(test[,18],start=test[1,1]);
		annual=ts.union(annual,y)}
	if(K==1) annual= ts(as.matrix(annual),start=test[1,1]);
	dimnames(annual)[[2]]=names(station0)
		#collate dset=0 annual data into a time-series object


      #iterative procedure for combining dset=0 series	
       #set-up initial iteration
     fixed=1; # set as the first 
     if( K>1) {		
	long0=apply(annual,2,long) 
	fixed=min ( (1:K)[long0==max(long0,na.rm=T)],na.rm=T)
	temp=!is.na(match(names(station0),prov$dup[prov$source=="sumofday"]) ); if(sum(temp)==1) fixed=(1:K)[temp]
	temp=!is.na(match(names(station0),prov$dup[prov$source=="ushcn"]) ); if(sum(temp)==1) fixed=(1:K)[temp]
	temp=!is.na(match(names(station0),prov$dup[prov$source=="mcdw"]) ); if(sum(temp)==1) fixed=(1:K)[temp]
	}	

	available=(1:K)[is.na(match(1:K,fixed))] #set all other columns as available
	fixed_stations=X[,fixed]; #matrix of included stations: starts with column K only
	interim=fixed_stations    #interim value starts with the one series in the matix
	fixed_annual=annual[,fixed]  #annual starts with column K
	reference=hansenq(interim,method="ts")  # calculates Hansen quarters+annual  
	delta=rep(NA,K);delta[fixed]=0  #set up vector to store deltas
	index=fixed
	
     #do iteration for (K-1) series; 	
     if(K>1) { 
	for (j in 2:K) {
	 if( !(is.null(available) )) {
	  M=length(available);
	  overlap0=rep(NA,M);  
	  for (i in 1:M){ overlap0[i]= overlap_hansen(x=reference[,5],y=annual[,available[i] ])}  
		#calculates overlap with "fixed"
	  recent0=rep(NA,M);recent0=apply(as.matrix(annual[,available]),2,recent)
		#calculates most recent  
		#check if there is an overlap of at least 4
	    if (max(overlap0)<4) available=NULL else {
	     j0=available[order(overlap0,recent0,decreasing=TRUE)][1]  #chooses the series to be added
	     delta[j0]=hansen_bias(annual[,j0], reference[,5],method="ts");#calculates Hansen delta
	     index=c(index,j0)
	     fixed=c(fixed,j0) #add new inclusion to fixed and then same with time series
	     available=available[is.na(match(available,j0))];if(length(available)==0) available=NULL  #calculate new available
	     fixed_stations=cbind(fixed_stations,X[,j0]+delta[j0]);dimnames(fixed_stations)[[2]]=dimnames(X)[[2]][fixed]
             fixed_annual=cbind(fixed_annual,annual[,j0]+delta[j0]);dimnames(fixed_annual)[[2]]=dimnames(X)[[2]][fixed]
  	     interim=ts(   apply(fixed_stations,1,mean,na.rm=TRUE)  , start=c( tsp(annual)[1],1) ,freq=12) #re-calculate interim
	     reference=hansenq(interim,method="ts")  #calculate Hansen quarters for interim
    	    }
	  } #available- if clause 
	  } # j-iteration
       } #endif

	emulate_dset1=list(X,annual,station1,annual1,fixed_stations,fixed_annual,reference,delta,index)
	names(emulate_dset1)=c("station0","annual0","station1","annual1","adjusted","adjusted_annual","reference","delta","index")
	emulate_dset1
	}	
#################

##CASE STUDY
	
	#id0="22340405000"#Gassim,
	#station0=read.giss(id0)

	#test=hansenq(station0[[1]])
	#round(test,2)- as.matrix(station0[[1]][,14:18])
	  #all values are within 0.05 

	#test=emulate_dset1(id0);
	#round(test$delta,2)
		# 0.15 0.00


	#round(test$delta,2)
		# -0.34 -0.16  0.00

	
	#id0="22230309000"#bratsk
	#station0=read.giss(id0)
	#test=emulate_dset1(id0);
	#x=round(test$reference[,5],2);
	#combine=data.frame(test$annual1,x,test$annual1-x);combine
		#matches closely
	#round(test$delta,2)  #Hansen "biases" for each version
		#[1] -0.01 -0.07 -0.07 -0.12  0.00

	#id0="22230554000" #bagdarin
	#test=emulate_dset1(id0);
	#round(test$delta,2)
		# -0.34 -0.16  0.00


	#id0="22220674000"
	#id0="22223022001" #WAigatz
	#id0="22231829000";#cerdyn