########################################################################################################### ### ### ### SECTION 1: GENERIC FUNCTIONS ### ### ### ########################################################################################################### ############################### ### ### SUM-OF-SQUARES ### ############################### .em.ssq = ssq = function (X) {sum(X ^ 2)} ############################### ### ### MATRIX SCALING, CENTERING, ### AND ADDITION ### ############################### .em.mScale = function (X, d = 1, scale = TRUE, center = FALSE, add = FALSE, type = 1, na.rm = TRUE, max.size = 500) { ### Convert max.size (command line argument is in thousands) max.size = max.size * 1000 ### Get matrix data n = nrow(X) p = ncol(X) pts = n * p ts.test = FALSE if (is.ts(X)) { ts.st = start(X) ts.fr = frequency(X) tsp(X) = NULL ts.test = TRUE } ### Check for proper length of d & correct ADD setting if (length(d) == 1 & type == 1 & (scale == TRUE | add == TRUE)) {d = rep(d, p)} if (length(d) == 1 & type == 2 & (scale == TRUE | add == TRUE)) {d = rep(d, n)} if (length(d) != p & type == 1 & (scale == TRUE | add == TRUE)) {warning("Scaling vector length different than number of columns.\nScaling vector recycled.", call. = FALSE, immediate. = TRUE)} if (length(d) < n & type == 2 & (scale == TRUE | add == TRUE)) {warning("Scaling vector length less than number of rows.\nNAs produced.", call. = FALSE, immediate. = TRUE)} if (length(d) > n & type == 2 & (scale == TRUE | add == TRUE)) {warning("Scaling vector length greater than number of rows.\nScaling vector truncated.", call. = FALSE, immediate. = TRUE)} ### If matrix too big, break into chunks if (pts > max.size) { blocks = ceiling(pts / max.size) rows = ceiling(n / blocks) } else { blocks = 1 rows = n } ### Do scaling / centering for (i in 1:blocks) { ### Get start and end points st = 1 + (i - 1) * rows en = min(i * rows, n) ### Column centering if (type == 1 & center == TRUE & en > st) { cd = colMeans(X, na.rm = na.rm) s = matrix(cd, ncol = p, nrow = en - st + 1, byrow = TRUE) X[st:en, ] = X[st:en, ] - s rm(cd, s) } ### Row centering if (type == 2 & center == TRUE & en > st) { cd = rowMeans(X, na.rm = na.rm) s = matrix(cd[st:en], ncol = p, nrow = en - st + 1) X[st:en, ] = X[st:en, ] - s rm(cd, s) } ### Column scaling / adding if (type == 1 & (scale == TRUE | add == TRUE) & en > st) { s = matrix(d, ncol = p, nrow = en - st + 1, byrow = TRUE) if (scale == TRUE) {X[st:en, ] = X[st:en, ] * s} if (add == TRUE) {X[st:en, ] = X[st:en, ] + s} rm(s) } ### Row scaling / adding if (type == 2 & (scale == TRUE | add == TRUE) & en > st) { s = matrix(d[st:en], ncol = p, nrow = en - st + 1) if (scale == TRUE) {X[st:en, ] = X[st:en, ] * s} if (add == TRUE) {X[st:en, ] = X[st:en, ] + s} rm(s) } } ### Cleanup & return if (ts.test == TRUE) {X = ts(X, start = ts.st, freq = ts.fr)} X } ############################### ### ### PAIRED, FULL, & PIECEWISE ### COVARIANCE AND CORRELATION ### ############################### .em.covFn = function (X, Y, cols = TRUE, paired = TRUE, cor = FALSE, DoFL = NA, pad = FALSE) { ### Compare existing points only (only useful if paired) if (paired == TRUE) { map = is.na(X) | is.na(Y) X[map] = NA Y[map] = NA } else { map = matrix(FALSE, nrow = nrow(X), ncol = ncol(X)) } ### Variables in columns if (cols == TRUE) { X = t(X) - colMeans(X, na.rm = TRUE) Y = t(Y) - colMeans(Y, na.rm = TRUE) ### Variables in rows } else { X = X - rowMeans(X, na.rm = TRUE) Y = Y - rowMeans(Y, na.rm = TRUE) } ### Find effective degrees of freedom if (pad == FALSE) { n.eff = colSums(!map) - DoFL n.eff[n.eff < 2 * DoFL] = NA } else { n.eff = ncol(X) - DoFL } ### Calculate if (paired == TRUE) { if (cor == FALSE) { r = (rowSums(X * Y, na.rm = TRUE)) / n.eff } else { r = rowSums(X * Y, na.rm = TRUE) / sqrt(rowSums(X ^ 2, na.rm = TRUE) * rowSums(Y ^ 2, na.rm = TRUE)) } } else { if (cor == FALSE) { r = X %*% t(Y) / n.eff } else { r = ((X / sqrt(rowSums(X ^ 2))) %*% t(Y / sqrt(rowSums(Y ^ 2)))) } } ### Return r } ############################### ### ### NEAREST TRUE CORRELATION / ### COVARIANCE MATRIX ### ############################### .em.cCor = function (X, w = NA, tol = 1e-6, plt = FALSE, maxiter = 250) { ### Setup variables S = matrix(0, ncol = ncol(X), nrow = nrow(X)) D = rep(1, nrow(X)) if (identical(w, NA)) {w = rep(1, nrow(X))} w.5 = sqrt(w) iter = 1 err = 1e12 ### Verify X is correlation; if not, convert if (!identical(diag(X), rep(1, nrow(X)))) { D = sqrt(diag(X)) Y = X = Xo = (1 / D) * X %*% diag(1 / D, ncol = nrow(X)) } while (iter < maxiter & err > tol) { ### Previous iteration error correction R = Y - S ### Project R onto subspace S P = diag(w.5) %*% R %*% diag(w.5) e = eigen(P, symmetric = TRUE) rank = c(1:length(e$values))[e$values >= 0] P.plus = e$vectors[, rank] %*% diag(e$values[rank], ncol = max(rank)) %*% t(e$vectors[, rank]) ### Save projection in matrix X X = diag(1 / w.5) %*% P.plus %*% diag(1 / w.5) ### Find error between projections S = X - R ### Set the diagonal of Y to 1 Y = X diag(Y) = 1 ### Update iter = iter + 1 err = sqrt(abs(sum(diag(X - Y)))) ### Plot? if (plt == TRUE) { plot(as.vector(Xo), col = 1, pch = 15, cex = 0.8, xaxt = "n", ylab = "Correlation Coefficient", xlab = "Black: Original\nBlue: S-Projection\nGreen: U-Projection", main = paste("Iteration: ", iter, "\nError (Froebenius Norm of Difference in S and U Projections): ", round(err, 6), sep = "")) points(as.vector(X), col = 4, pch = 15, cex = 0.6) points(as.vector(Y), col = 3, pch = 15, cex = 0.4) } } ### Truncate negative eigenvalues e = eigen(Y, symmetric = TRUE) Y = e$vectors[, rank] %*% diag(e$values[rank], ncol = max(rank)) %*% t(e$vectors[, rank]) diag(Y) = 1 ### Convert Y to covariance & return Y = D * Y %*% diag(D, ncol = nrow(X)) ccov = list() ccov$cov = Y ccov$rank = max(rank) ccov } ############################### ### ### CENTERING FOR CYCLO- ### STATIONARY AND STATIONARY ### DATA ### ############################### .em.center = function (X, DoFL = 12, idx = NA, robust = FALSE) { ### Get data size n = nrow(X) p = ncol(X) ### Matrix for saving centering matrix center = matrix(nrow = n, ncol = p) ### Adjust means for each lost DoF for (i in 1:DoFL) { ### Get the cycle row.idx = seq(i, n, DoFL) temp = t(X[row.idx, ]) ### Supplied baseline? if (!identical(idx, NA)) {temp[t(!idx[row.idx, ])] = NA} ### Use mean or median? if (robust == FALSE) {center.vec = rowMeans(temp, na.rm = TRUE)} else {center.vec = apply(temp, 1, median, na.rm = TRUE)} ### Calculate centering vector & turn into a matrix center.vec[is.na(center.vec)] = 0 center[row.idx, ] = matrix(rep(center.vec, length(row.idx)), nrow = length(row.idx), byrow = TRUE) ### Apply via matrix addition X[row.idx, ] = t(t(X[row.idx, ]) - center.vec) } ### Return center = list(X = X, center = center) center } ########################################################################################################### ### ### ### SECTION 2: DATA PREP / COMMON FUNCTIONS ### ### ### ########################################################################################################### ############################### ### ### CHECKS FOR SUFFICIENT NO. ### OF POINTS ### ############################### .em.dofChk = function (missing, DoFL, p) { ### Get original values n.orig = colSums(!missing) ### Too few values for mean calculations? n.low = n.orig < DoFL + 1 ### Yep . . . so error out if (sum(n.low) > 0) { stop.cols = paste(" ", c(1:p)[n.low], "\n", sep = "") stop(paste("Insufficient number of points (less than ", DoFL + 1, ") in variables", stop.cols, sep = "")) } ### Return original values n.orig } ############################### ### ### PREPS COVARIANCE MATRIX ### PRIOR TO FEEDING TO ### REGRESSION FUNCTIONS ### ############################### .em.prepC = function (X, C, scales, missing, ccov, n, p, DoFL) { ### Supplied C? if (ccov == TRUE & identical(C, NA)) { ### Setup matrix C = diag(scales ^ 2, ncol = p) ### Populate off-diagonal terms for (i in 2:p) { base = matrix(rep(X[, i], i - 1), ncol = i - 1) comp = matrix(X[, 1:(i - 1)], ncol = i - 1) est.cov = .em.covFn(base, comp, paired = TRUE, cor = FALSE, cols = TRUE, DoFL = DoFL, pad = FALSE) est.cov[is.na(est.cov)] = 0 C[i, 1:(i - 1)] = C[1:(i - 1), i] = est.cov } ### Find the closest correlation matrix to C C = .em.cCor(C)$cov } ### Initial infill X[missing] = 0 Xhat = matrix(0, nrow = n, ncol = p) if (ccov == FALSE & identical(C, NA)) {C = t(X) %*% X / (n - DoFL)} ### Return list(X, C) } ############################### ### ### BUILD LIST OF UNIQUE ### PATTERNS ### ############################### .em.unique = function (X, missing) { ### Get data size & set up storage n = nrow(X) p = ncol(X) master.cols = c(1:p) master.rows = c(1:n) common = list() miss = list() avail = list() ### Check for any complete rows complete = rowSums(missing) == p if (sum(complete) > 0) {master.rows = master.rows[!complete]} ### Set up first pattern common[[1]] = master.rows[1] miss[[1]] = master.cols[missing[master.rows[1], ]] avail[[1]] = master.cols[!missing[master.rows[1], ]] master.rows = master.rows[-1] ### Find patterns for (i in master.rows) { ### Set flag found = FALSE ### Pattern already identified? for (j in 1:length(common)) { ### Yep, so assign and exit loop if (identical(miss[[j]], master.cols[missing[i, ]])) { common[[j]] = c(common[[j]], i) found = TRUE j = length(common) } } ### Nope, so set new pattern if (found == FALSE) { ### Save pattern common[[length(common) + 1]] = i miss[[length(common)]] = master.cols[missing[i, ]] avail[[length(common)]] = master.cols[!missing[i, ]] } } ### Return patterns = list(common = common, miss = miss, avail = avail) patterns } ############################### ### ### SETUP RIDGE REGRESSION ### STATION LISTS ### ############################### .em.subProblems = function (data.patterns, type) { ### Get number of unique combinations p = length(data.patterns$common) ### Set up storage list miss = list() ### Cycle through each combination for (i in 1:p) { ### Set up list miss[[i]] = list() ### Get the predictands for the combination being analyzed m = data.patterns$miss[[i]] ### For multiple regression, assign all if (type != "iridge") {miss[[i]][[1]] = m} ### For individual regression, assign each individually if (type == "iridge") {miss[[i]] = as.list(m)} } ### Return miss } ############################### ### ### CALCULATE HEURISTIC BOUNDS ### ON REGULARIZATION ### ############################### .em.ridgeLimit = function (manual, relvar, d.var, rank.eff, iter, d, h.tol, n.eff, tr.E0, tr.S) { ### No manually supplied regularization parameter if (identical(NA, manual)) { ### Calculate upper bound using either the minimum retained variance specification (relvar, if supplied) or the largest eigenvalue if (relvar > min(d.var)) { ### Find eigenvalue closest to relvar q = min(max(which(d.var < relvar)), length(d) - 1) ### Set limit max.h = (d[q] + (d[q + 1] - d[q]) * (relvar - d.var[q]) / (d.var[q + 1] - d.var[q])) } else { ### Otherwise, set max based on largest eigenvalue and the minimum discrimination max.h = sqrt(max(d) / h.tol) } ### Calculate lower bound using truncated SVD solutions if (tr.E0 > tr.S) { min.h = sqrt(.Machine$double.eps) } else { ### Find closest eigenvalue r = abs(c(rank.eff:1) * (tr.E0 + sqrt(.Machine$double.eps)) - tr.S) min.r = c(1:rank.eff)[r == min(r)] ### Set min based on closest eigenvalue min.h = sqrt(max(d[min.r], min(d) / n.eff)) } ### Check for sensible limits if (min.h > max.h) { warning(paste("Upper bound less than lower bound on regularization parameter (iteration: ", iter, ")\n", sep = "")) max.h2 = min.h min.h = max.h max.h = max.h2 } } ### Manually supplied regularization parameter if (!identical(NA, manual)) { ### Use manually supplied parameter min.h = max.h = manual } ### Return c(min.h, max.h) } ############################### ### ### CALCULATE HOLDOUT ERROR ### ############################### .em.holdout = function (X, covr, wts, char, DoFL, missing, cv.map, n.err, robust) { ### Center on calibration periods X0 = .em.center(char$X0, DoFL, !missing, robust)[[1]] X = .em.center(X, DoFL, !missing, robust)[[1]] ### Find scales and weighting factors if (covr == TRUE) { D = wts / rep(1, ncol(X)) } else { D = char$X0.scales D = wts / D } ### Error X0.rel = t(t(X0) * D) X.rel = t(t(X) * D) ### Calculate absolute cross-validation error err = (X0 - X)[cv.map] cv.abs.err = sqrt(sum(err ^ 2) / n.err) ### Calculate relative cross-validation error err = (X0.rel - X.rel)[cv.map] cv.err = sqrt(sum(err ^ 2) / n.err) ### Return list(cv.err, cv.abs.err) } ########################################################################################################### ### ### ### SECTION 3: INTERNAL PLOTTING FUNCTIONS ### ### ### ########################################################################################################### ############################### ### ### PLOT A GIVEN SERIES, ### ACTUAL & ESTIMATED DATA ### ############################### .em.plotSeries = function (arg, dat, char, n.eof, iter, series) { ### Get X and Xhat X = dat$X[, series] Xhat = dat$Xhat[, series] ### Remove missing values from X arg$missing = arg$missing[, series] X[arg$missing] = NA ### Get plot limits y.lim = c(min(X, Xhat, na.rm = TRUE), max(X, Xhat, na.rm = TRUE)) ### Get title main.txt = paste("Series ", series, " - Iteration: ", iter, "; Actual (Black) & Estimated (Red)", sep = "") ### Get y-axis label y.lab = "Deg C" ### Get plot type X.type = "l" Xhat.type = "l" ### Get plotting characters X.pch = 1 Xhat.pch = 1 ### Get plotting character size X.cex = 1 Xhat.cex = 1 ### Get colors X.col = 1 Xhat.col = 2 ### Plot plot(Xhat, ylim = y.lim, type = Xhat.type, col = Xhat.col, xaxt = "n", ylab = y.lab, main = main.txt, pch = Xhat.pch, cex = Xhat.cex, cex.main = 1) abline(h = 0, col = 4) points(Xhat,type = Xhat.type, col = Xhat.col, pch = Xhat.pch, cex = Xhat.cex) points(X, type = X.type, col = X.col, pch = Xhat.pch, cex = Xhat.cex) } ############################### ### ### PLOT INITIAL VARIANCE ### ESTIMATE, COVARIANCE ### MATRIX DIAGONAL, AND ### VARIANCE OF ESTIMATED ### VALUES ### ############################### .em.plotVar = function (arg, dat, char, n.eof, iter, series) { ### Extract original variance estimate orig = char$scales ^ 2 ### Extract covariance matrix diagonal diag.C = diag(dat$C) ### Calculate actual variance for estimated values (based on calibration period mean being zero) Xhat = dat$X Xhat[!arg$missing] = NA act = colSums(Xhat ^ 2, na.rm = TRUE) / (colSums(arg$missing) - arg$DoFL) lo.idx = colSums(arg$missing) - arg$DoFL <= 2 * arg$DoFL if (sum(lo.idx) > 0) {act[lo.idx] = apply(matrix(Xhat[, lo.idx], ncol = sum(lo.idx)), 2, var)} ### Get plot limits y.lim = c(0, max(diag.C, orig, act, na.rm = TRUE)) ### Get span span = c(1:length(orig)) ### Get line drawing vectors for diag.C C.x = as.vector(t(matrix(c(span - 0.15, span - 0.15, rep(NA, length(span))), ncol = 3))) C.err = as.vector(t(matrix(c(orig, diag.C, rep(NA, length(span))), ncol = 3))) ### Get line drawing vectors for act act.x = as.vector(t(matrix(c(span + 0.15, span + 0.15, rep(NA, length(span))), ncol = 3))) act.err = as.vector(t(matrix(c(orig, act, rep(NA, length(span))), ncol = 3))) ### Get line drawing vectors for span orig.span = as.vector(t(matrix(c(span - 0.15, span + 0.15, rep(NA, length(span))), ncol = 3))) orig.var = as.vector(t(matrix(c(orig, orig, rep(NA, length(span))), ncol = 3))) ### Get title main.txt = "Variance (Black: Original Estimate; Red: Cov. Matrix Diagonal; Green: Infilled Only)" ### Get y-axis label y.lab = "Variance" ### Get plot type orig.type = "h" diag.C.type = "p" act.type = "p" ### Get plotting characters orig.pch = 15 diag.C.pch = 15 act.pch = 15 ### Get plotting character size orig.cex = 0.75 diag.C.cex = 0.75 act.cex = 0.75 ### Get colors orig.col = 1 diag.C.col = 2 act.col = 3 ### Plot plot(orig, xlim = c(0, length(orig) + 1), ylim = y.lim, type = orig.type, col = orig.col, xaxt = "n", ylab = y.lab, main = main.txt, pch = orig.pch, cex = orig.cex, cex.main = 1) abline(h = 0, col = 4) points(orig, type = act.type, col = orig.col, pch = orig.pch, cex = orig.cex) lines(x = orig.span, y = orig.var, col = orig.col) lines(x = C.x, y = C.err, col = diag.C.col) lines(x = act.x, y = act.err, col = act.col) points(diag.C ~ I((1:arg$p) - 0.15), type = diag.C.type, col = diag.C.col, pch = diag.C.pch, cex = diag.C.cex) points(act ~ I((1:arg$p) + 0.15), type = act.type, col = act.col, pch = act.pch, cex = act.cex) } ############################### ### ### PLOT INITIAL VARIANCE ### ESTIMATE, COVARIANCE ### MATRIX DIAGONAL, AND ### VARIANCE OF ESTIMATED ### VALUES (TSVD SPECIFIC) ### ############################### .em.plotVarTsvd = function (arg, dat, char, n.eof, iter, series) { ### Extract original variance estimate orig = char$scales ^ 2 ### Calculate actual variance for estimated values (based on calibration period mean being zero) Xhat = dat$X Xhat[!arg$missing] = NA act = colSums(Xhat ^ 2, na.rm = TRUE) / (colSums(arg$missing) - arg$DoFL) lo.idx = colSums(arg$missing) - arg$DoFL <= 2 * arg$DoFL if (sum(lo.idx) > 0) {act[lo.idx] = apply(matrix(Xhat[, lo.idx], ncol = sum(lo.idx)), 2, var)} ### Get plot limits y.lim = c(0, max(orig, act, na.rm = TRUE)) ### Get span span = c(1:length(orig)) ### Get line drawing vectors for act act.x = as.vector(t(matrix(c(span + 0.15, span + 0.15, rep(NA, length(span))), ncol = 3))) act.err = as.vector(t(matrix(c(orig, act, rep(NA, length(span))), ncol = 3))) ### Get line drawing vectors for span orig.span = as.vector(t(matrix(c(span, span + 0.15, rep(NA, length(span))), ncol = 3))) orig.var = as.vector(t(matrix(c(orig, orig, rep(NA, length(span))), ncol = 3))) ### Get title main.txt = "Variance (Black: Original Estimate; Red: Cov. Matrix Diagonal; Green: Infilled Only)" ### Get y-axis label y.lab = "Variance" ### Get plot type orig.type = "h" act.type = "p" ### Get plotting characters orig.pch = 15 act.pch = 15 ### Get plotting character size orig.cex = 0.75 act.cex = 0.75 ### Get colors orig.col = 1 act.col = 3 ### Plot plot(orig, ylim = y.lim, type = orig.type, col = orig.col, xaxt = "n", ylab = y.lab, main = main.txt, pch = orig.pch, cex = orig.cex, cex.main = 1) abline(h = 0, col = 4) points(orig, type = act.type, col = orig.col, pch = orig.pch, cex = orig.cex) lines(x = orig.span, y = orig.var, col = orig.col) lines(x = act.x, y = act.err, col = act.col) points(act ~ I((1:arg$p) + 0.15), type = act.type, col = act.col, pch = act.pch, cex = act.cex) } ############################### ### ### PLOTS THE SINGULAR VALUE ### SPECTRUM & SAMPLING ERROR ### ESTIMATE (NORTH et al., ### 1982) ### ############################### .em.plotSV = function (arg, dat, char, n.eof, iter, series) { ### Extract eigenvalue spectrum d = sqrt(dat$d * arg$n) d[d < sqrt(.Machine$double.eps)] = sqrt(.Machine$double.eps) ### Calculate sampling error err = sqrt(2 / arg$n) * d ### Calculate error bounds max.err = d + err min.err = d - err min.err[min.err < sqrt(.Machine$double.eps)] = sqrt(.Machine$double.eps) ### Calculate x-axis span span = c(1:length(d)) ### Get line drawing vectors x.err = as.vector(t(matrix(c(span, span, rep(NA, length(d))), ncol = 3))) y.err = as.vector(t(matrix(c(max.err, min.err, rep(NA, length(d))), ncol = 3))) x.err.ends = rep(as.vector(t(matrix(c(span + 0.2, span - 0.2, rep(NA, length(d))), ncol = 3))), 2) y.err.ends.hi = as.vector(t(matrix(c(max.err, max.err, rep(NA, length(d))), ncol = 3))) y.err.ends.lo = as.vector(t(matrix(c(min.err, min.err, rep(NA, length(d))), ncol = 3))) y.err.ends = c(y.err.ends.hi, y.err.ends.lo) ### Get plot limits y.lim = c(min(min.err), max(max.err, na.rm = TRUE)) ### Get title main.txt = paste("Singular Value Spectrum with Sampling Error (Truncation Parameter: ", n.eof, ")", sep = "") ### Get y-axis label y.lab = "Variance (Log Scale)" ### Get plot type d.type = "h" err.type = "p" ### Get plotting characters d.pch = 1 err.pch = 15 ### Get plotting character size d.cex = 1 err.cex = 0.5 ### Get colors d.col = 1 err.col = 2 ### Plot plot(d, ylim = y.lim, log = "y", type = d.type, col = d.col, xaxt = "n", ylab = y.lab, main = main.txt, pch = d.pch, cex = d.cex, cex.main = 1) lines(x = x.err, y = y.err, col = err.col) lines(x = x.err.ends, y = y.err.ends, col = err.col) points(d, col = d.col, pch = err.pch, cex = err.cex, type = err.type) } ############################### ### ### PLOTS EFFECTIVE PARAMETERS ### ############################### .em.plotParams = function (arg, dat, char, n.eof, iter, series) { ### Extract & format parameters p = dat$p.eff p[!arg$missing] = NA p = rowMeans(p, na.rm = TRUE) p[is.na(p)] = 0 ### Get plot limits y.lim = c(0, max(p, na.rm = TRUE)) ### Get title main.txt = "Effective Parameters (Time Step Average)" ### Get y-axis label y.lab = "Effective Params" ### Get plot type p.type = "p" ### Get plotting characters p.pch = 15 ### Get plotting character size p.cex = 0.5 ### Get colors p.col = 1 ### Plot plot(p, ylim = y.lim, type = p.type, col = p.col, xaxt = "n", ylab = y.lab, main = main.txt, pch = p.pch, cex = p.cex, cex.main = 1) abline(h = 0, col = 4) } ############################### ### ### PLOTS ESTIMATED IMPUTATION ### ERROR ### ############################### .em.plotEstErr = function (arg, dat, char, n.eof, iter, series) { ### Get data rel.err = dat$rel.err len = length(rel.err) ### Get plot limits y.lim = c(0, max(rel.err, na.rm = TRUE)) ### Get title main.txt = "Estimated Imputation Error" ### Get y-axis label y.lab = paste(ifelse(arg$covr == TRUE, "Absolute", "Relative"), " Error") ### Get plot type e1.type = "p" e2.type = "l" ### Get plotting characters e1.pch = 15 e2.pch = 1 ### Get plotting character size e1.cex = 0.5 e2.cex = 1 ### Get colors e1.col = 1 e2.col = 1 ### Plot plot(rel.err, ylim = y.lim, type = e1.type, col = e1.col, xaxt = "n", ylab = y.lab, main = main.txt, pch = e1.pch, cex = e1.cex, cex.main = 1) abline(h = 0, col = 4) points(rel.err, type = e2.type, col = e2.col, pch = e2.pch, cex = e2.cex) legend.text = paste(round(rel.err[len], 5), " (Est. Error)", sep = "") legend("bottomleft", legend = legend.text, fill = 1, text.col = 1, cex = 0.75, pt.cex = 0.75, bg = "white") } ############################### ### ### PLOTS ESTIMATED IMPUTATION ### ERROR & HOLDOUT ERROR ### ############################### .em.plotActErr = function (arg, dat, char, n.eof, iter, series) { ### Get data cv.err = dat$cv.err rel.err = dat$rel.err len = length(rel.err) ### Get plot limits if (iter == 1) {y.lim = c(0, 1)} else { y.lim = c(0, max(cv.err[-1], rel.err[-1], na.rm = TRUE)) } ### Get title main.txt = "Black: Actual Holdout Error; Red: Estimated Imputation Error" ### Get y-axis label y.lab = paste(ifelse(arg$covr == TRUE, "Absolute", "Relative"), " Error") ### Get plot type c1.type = "p" c2.type = "l" e1.type = "p" e2.type = "l" ### Get plotting characters c1.pch = 15 c2.pch = 1 e1.pch = 15 e2.pch = 1 ### Get plotting character size c1.cex = 0.5 c2.cex = 1 e1.cex = 0.5 e2.cex = 1 ### Get colors c1.col = 1 c2.col = 1 e1.col = 2 e2.col = 2 ### Plot plot(rel.err, ylim = y.lim, type = e1.type, col = e1.col, xaxt = "n", ylab = y.lab, main = main.txt, pch = e1.pch, cex = e1.cex, cex.main = 1) abline(h = 0, col = 4) points(rel.err, type = e2.type, col = e2.col, pch = e2.pch, cex = e2.cex) points(cv.err, type = c1.type, col = c1.col, pch = c1.pch, cex = c1.cex) points(cv.err, type = c2.type, col = c2.col, pch = c2.pch, cex = c2.cex) legend.text = c(paste(round(cv.err[len], 5), " (Act. Error)", sep = ""), paste(round(rel.err[len], 5), " (Est. Error)", sep = "")) legend("bottomleft", legend = legend.text, fill = c(1, 2), text.col = c(1, 2), cex = 0.75, pt.cex = 0.75, bg = "white") } ########################################################################################################### ### ### ### SECTION 4: REGRESSION FUNCTIONS ### ### ### ########################################################################################################### ############################### ### ### GCV FOR RIDGE PARAMETER ### ############################### .em.gcvFn = function(h, E0, FR, d, n, opt) { ### Filter factors & effective DoF filt = (h ^ 2) / (d + h ^ 2) n.a = n - length(d) n.f = n.a + sum(filt) ### Using Golub or Mallows? if (opt == TRUE) { ### Golub G = sum((n ^ 2 / n.f ^ 2) * (E0 + sum(FR * filt ^ 2))) } else { ### Mallows G.C = E0 + sum(FR * filt ^ 2) G.E = 0 if (sum(n.a) > 0) {G.E = ((2 * n.f - n) / n.a) * E0} G = sum(G.C - G.E) } ### Return G } ############################### ### ### TTLS REGRESSION FUNCTION ### ############################### .em.ttlsRegFn = function (arg, dat, char, n.eof, iter) { ### Initialize B storage if (!is.na(arg$B.series[1]) & arg$B.series[1] != 0) { for (j in arg$B.series) { dat$B[[j]] = matrix(0, nrow = arg$n, ncol = arg$p) attr(dat$B[[j]], "name") = char$r.names[j] attr(dat$B[[j]], "variable number") = j } } ### New X.err, C.res and Xhat C.res = matrix(0, nrow = arg$p, ncol = arg$p) X.err = Xhat = matrix(0, nrow = arg$n, ncol = arg$p) ### Find scales and weighting factors if (arg$covr == TRUE) { D = arg$wts / rep(1, arg$p) } else { D = sqrt(diag(dat$C)) D = arg$wts / D } ### Scale if (arg$covr == FALSE | arg$wts.flag == TRUE) { dat$X = .em.mScale(dat$X, D, type = 1, add = FALSE, center = FALSE, max.size = arg$max.size) dat$Xhat = .em.mScale(dat$Xhat, D, type = 1, add = FALSE, center = FALSE, max.size = arg$max.size) dat$C = .em.mScale(D * dat$C, D, type = 1, add = FALSE, center = FALSE, max.size = arg$max.size) } ### Perform eigendecomposition C.eig = eigen(dat$C, symmetric = TRUE) ### Find non-zero eigenvalues max.rank = max(1, sum(C.eig$values > (C.eig$values[1] * sqrt(.Machine$double.eps)))) dat$d = d = C.eig$values[1:max.rank] dat$V = V = matrix(C.eig$vectors[, 1:max.rank], ncol = max.rank) dat$p.eff = vector() ### Cleanup rm(C.eig) ### Iterate through unique combinations of available/missing values for (i in 1:length(arg$common)) { ### Set truncation parameter rank.eff = max(1, min(n.eof, max.rank, length(arg$avail[[i]]))) ### Get the row indices idx = arg$common[[i]] ### Parse a = arg$avail[[i]] m = arg$miss[[i]][[1]] n.miss = length(m) n.avail = length(a) if (n.miss > 0) { ### Get components Va1 = matrix(V[a, 1:rank.eff], ncol = rank.eff) tVm1 = t(matrix(V[m, 1:rank.eff], ncol = rank.eff)) tV1 = t(matrix(V[, 1:rank.eff], ncol = rank.eff)) ### Set diagnostic parameter for singularity sing = TRUE ### Get inverse while (sing == TRUE) { ### Try to take inverse inv = try(Va1 %*% solve(t(Va1) %*% Va1), silent = TRUE) ### Worked! if (is.matrix(inv)) {sing = FALSE} else { ### Didn't work! rank.eff = rank.eff - 1 Va1 = matrix(V[a, 1:rank.eff], ncol = rank.eff) tVm1 = t(matrix(V[m, 1:rank.eff], ncol = rank.eff)) tV1 = t(matrix(V[, 1:rank.eff], ncol = rank.eff)) } } ### Save rank.eff dat$p.eff[idx] = rank.eff ### Get coefficient matrices B = inv %*% tV1 B2 = inv %*% tVm1 ### Estimated imputation error and residual covariance matrix if (rank.eff < arg$p) { ### Setup len = max.rank - rank.eff mode.idx = c((rank.eff + 1):max.rank) Va2 = matrix(V[a, mode.idx], ncol = len) Vm2 = matrix(V[m, mode.idx], ncol = len) d2 = diag(d[mode.idx], ncol = len) ### Calculate sum.E = Vm2 %*% d2 %*% t(Vm2) X.err[idx, m] = matrix(rep(sqrt(diag(sum.E)), length(idx)), ncol = n.miss, byrow = TRUE) ### Update C.res[m, m] = C.res[m, m] + (length(idx) * arg$inflate * sum.E) } ### Find the solution Xhat[idx, ] = dat$X[idx, a] %*% B ### Store B if (!is.na(arg$B.series[1]) & arg$B.series[1] == 0) { attr(B, "miss") = m attr(B, "rows") = idx dat$B[[i]] = B } if (!is.na(arg$B.series[1]) & arg$B.series[1] != 0) { for (j in arg$B.series) { dat$B[[j]][idx, a] = matrix(B[, j], ncol = length(B[, j]), nrow = length(idx), byrow = TRUE) } } } else { ### No missing values, so no solution Xhat[idx, ] = dat$X[idx, ] } } ### Save X.err & C.res dat$X.err = X.err dat$C.res = C.res / arg$n.eff ### Unscale if (arg$covr == FALSE | arg$wts.flag == TRUE) { dat$X = .em.mScale(dat$X, 1 / D, type = 1, add = FALSE, center = FALSE, max.size = arg$max.size) Xhat = .em.mScale(Xhat, 1 / D, type = 1, add = FALSE, center = FALSE, max.size = arg$max.size) dat$C.res = .em.mScale((1 / D) * dat$C.res, 1 / D, type = 1, add = FALSE, center = FALSE, max.size = arg$max.size) dat$X.err = .em.mScale(dat$X.err, 1 / D, type = 1, add = FALSE, center = FALSE, max.size = arg$max.size) } ### Compute solution rms change in Xhat dat$dXhat[iter] = sqrt(.em.ssq(Xhat[arg$missing] - dat$X[arg$missing]) / arg$n.mis) ### Compute change in mean of missing values temp.X = dat$X + dat$M dat$nXhat[iter] = sqrt(.em.ssq(temp.X[arg$missing]) / arg$n.mis) ### Compute stagnation value dat$rXhat[iter] = dat$dXhat[iter] / dat$nXhat[iter] ### Save X & update X & Xhat dat$X[arg$missing] = Xhat[arg$missing] dat$Xhat = Xhat ### Recenter center = .em.center(dat$X, 1) dat$X = center[[1]] dat$Xhat = dat$Xhat - center[[2]] colnames(dat$X) = colnames(dat$Xhat) = char$c.names rownames(dat$X) = rownames(dat$Xhat) = char$r.names ### Generate C dat$C = (t(dat$X) %*% dat$X) / arg$n.eff + dat$C.res ### Iteration estimated relative imputation error dat$rel.err[iter] = sqrt(sum(X.err[arg$missing] ^ 2) / arg$n.mis) ### Iteration estimated absolute imputation error dat$abs.err[iter] = sqrt(sum(dat$X.err[arg$missing] ^ 2) / arg$n.mis) ### Holdout error if (arg$holdout == TRUE) { err = .em.holdout(dat$X, arg$covr, arg$wts, char, arg$DoFL, arg$missing, arg$cv.map, arg$n.err, arg$robust) dat$cv.err[iter] = err[[1]] dat$cv.abs.err[iter] = err[[2]] } ### Cleanup & return dat } ############################### ### ### RIDGE REGRESSION FUNCTION ### ############################### .em.ridgeRegFn = function (arg, dat, char, n.eof, iter) { ### Initialize B storage if (!is.na(arg$B.series[1]) & arg$B.series[1] != 0) { for (j in arg$B.series) { dat$B[[j]] = matrix(0, nrow = arg$n, ncol = arg$p) attr(dat$B[[j]], "name") = char$r.names[j] attr(dat$B[[j]], "variable number") = j } } ### New storage variables C.res = matrix(0, nrow = arg$p, ncol = arg$p) X.err = Xhat = matrix(0, nrow = arg$n, ncol = arg$p) if (iter == 1) {dat$h = dat$p.eff = dat$v.ret = matrix(0, nrow = arg$n, ncol = arg$p)} ### Find scales and weighting factors if (arg$covr == TRUE) { D = arg$wts / rep(1, arg$p) } else { D = sqrt(diag(dat$C)) D = arg$wts / D } ### Scale if (arg$covr == FALSE | arg$wts.flag == TRUE) { dat$X = .em.mScale(dat$X, D, type = 1, add = FALSE, center = FALSE, max.size = arg$max.size) dat$Xhat = .em.mScale(dat$Xhat, D, type = 1, add = FALSE, center = FALSE, max.size = arg$max.size) dat$C = .em.mScale(D * dat$C, D, type = 1, add = FALSE, center = FALSE, max.size = arg$max.size) } ### Iterate through unique combinations of available/missing values for (i in 1:length(arg$common)) { ### Get pattern information a = arg$avail[[i]] m = arg$miss[[i]] m.all = unlist(arg$miss[[i]]) n.avail = length(a) n.miss = length(m.all) ### Get the row indices idx = arg$common[[i]] ### Compute B, scaled residual covariance matrix, and estimated values if (n.miss > 0) { ###### ### Step 1: Get ridge information for all missing variables ###### ### Setup B B = matrix(diag(rep(1, arg$p))[a, ], nrow = n.avail) ### Perform eigendecomposition C.eig = eigen(dat$C[a, a], symmetric = TRUE) ### Save dat$d = C.eig$values dat$V = C.eig$vectors ### Cleanup rm(C.eig) ### Find number of non-zero eigenvalues, adjusted for effective DoF and truncation rank.eff = min(sum(dat$d > dat$d[1] * sqrt(.Machine$double.eps)), arg$n.eff, arg$trunc, n.avail) dat$d = d = dat$d[1:rank.eff] dat$V = V = matrix(dat$V[, 1:rank.eff], ncol = rank.eff) dat$d.var = d.var = cumsum(d) / sum(d) ### Fourier coefficients Fo = matrix(diag(1 / sqrt(d), nrow = rank.eff) %*% t(V) %*% matrix(dat$C[a, m.all], ncol = n.miss), ncol = n.miss) ### Calculate Fourier norms F2 = Fo ^ 2 FR = rowSums(F2) FC = colSums(F2) ### Diagonal of covariance matrix for missing values if (n.miss > 1) {diag.C = diag(dat$C[m.all, m.all])} else {diag.C = dat$C[m.all, m.all]} ### Minimum bound on trace of the residual covariance matrix min.tr.S = sum(arg$min.relvar * diag.C) ### Minimum actual trace of residual covariance matrix tr.E0 = diag.C - FC ### Regularization parameter discrimination (heuristic) h.tol = 0.02 / sqrt(arg$n.eff) ### Find ridge parameter heuristic bounds h.interval = .em.ridgeLimit(arg$h.default, arg$min.relvar, d.var, rank.eff, iter, d, h.tol,arg$n.eff, sum(tr.E0), min.tr.S) ### Optimize? if (h.interval[1] < h.interval[2]) { ### If true, then h can be optimized h = base.h = optimize(.em.gcvFn, interval = h.interval, tol = h.tol, sum(tr.E0), FR, d, arg$n.eff, arg$opt)$minimum ### Set base.h to FALSE (base.h reserved for future implementation) base.h = FALSE } else { ### If false, then use the manually-supplied parameter h = base.h = arg$h.default } ###### ### Step 2: Solve each subproblem ###### ### Get number of subproblems sub.p = length(m) ### Loop through the subproblems for (j in 1:sub.p) { ### Get missing variables for subproblem sub.m = m[[j]] len.m = length(sub.m) idx.m = c(1:n.miss)[is.element(m.all, sub.m)] ### If more than 1 subproblem and GCV regularization parameter, get appropriate parameter if (sub.p > 1 & identical(NA, arg$h.default)) { ### Calculate Fourier norms F.sub = matrix(F2[, idx.m], ncol = len.m) FR.sub = rowSums(F.sub) ### Minimum bound on trace of the residual covariance matrix min.tr.S = sum(arg$min.relvar * diag.C[idx.m]) ### Minimum actual trace of residual covariance matrix tr.E0.sub = (diag.C - FC)[idx.m] ### Find ridge parameter heuristic bounds h.interval = .em.ridgeLimit(arg$h.default, arg$min.relvar, d.var, rank.eff, iter, d, h.tol, arg$n.eff, sum(tr.E0.sub), min.tr.S) ### Optimize? if (h.interval[1] < h.interval[2]) { ### If true, then h can be optimized (if not, simply retain the previously obtained h) h = optimize(.em.gcvFn, interval = h.interval, tol = h.tol, sum(tr.E0.sub), FR.sub, d, arg$n.eff, arg$opt)$minimum } } ### Save regularization parameter dat$h[idx, sub.m] = rep(h, length(idx) * len.m) ### Effective number of parameters dat$p.eff[idx, sub.m] = sum(d / (d + h ^ 2)) ### Retained variance dat$v.ret[idx, sub.m] = sum(d ^ 2 / (d + h ^ 2)) ### Partial filter factors filt = 1 / (d + h ^ 2) ### Only 1 eigenvalue? if (rank.eff == 1) { V.filt = matrix(V * filt, ncol = 1) ### More than 1 eigenvalue? } else { V.filt = .em.mScale(V, filt, type = 1, scale = TRUE, add = FALSE, center = FALSE, max.size = arg$max.size) } ### Calculate regression coefficients & solution - if more than 1 subproblem, Xhat is meaningless for the predictors if (sub.p == 1) { ### Get Xhat for predictors & predictands B = V.filt %*% t(V) %*% matrix(dat$C[a, ], nrow = n.avail) Xhat[idx, ] = matrix(dat$X[idx, a], nrow = length(idx)) %*% B } else { ### Get Xhat for predictands only B = V.filt %*% t(V) %*% matrix(dat$C[a, sub.m], nrow = n.avail) Xhat[idx, sub.m] = matrix(dat$X[idx, a], nrow = length(idx)) %*% B } } ###### ### Step 3: Residuals ###### ### Get time step regularization parameter, effective parameters, and retained variance h2 = dat$h[idx[1], m.all] ^ 2 p.eff = dat$p.eff[idx[1], m.all] ### Set up d for Hadamard product d.matrix = matrix(rep(d, ncol(Fo)), ncol = ncol(Fo)) ### Set up h for Hadamard product h.matrix = matrix(rep(h2, nrow(Fo)), nrow = nrow(Fo), byrow = TRUE) ### Partial filter factors filt = 1 / (d.matrix + h.matrix) ### d * h ^ 2: dh.matrix = d.matrix * h.matrix ### Calculate residual errors S0 = matrix(dat$C[m.all, m.all] - t(Fo) %*% Fo, ncol = n.miss) S.EE = matrix(t(Fo * filt * h.matrix) %*% (Fo * filt * h.matrix), ncol = n.miss) sum.E = S0 + S.EE ### Calculate relative prediction error var.R = diag(sum.E) sign.R = sign(var.R) DofR = matrix(rep(arg$n.eff - p.eff, length(idx)), ncol = n.miss, byrow = TRUE) X.err[idx, m.all] = (arg$n / DofR) * matrix(rep(sign.R * sqrt(abs(var.R)), length(idx)), ncol = n.miss, byrow = TRUE) ### Update residual covariances C.res[m.all, m.all] = C.res[m.all, m.all] + (length(idx) * arg$inflate * sum.E) } else { ### No missing values, so no solution or residuals Xhat[idx, ] = dat$X[idx, ] B = 0 } ### Store B if (!is.na(arg$B.series[1]) & arg$B.series[1] == 0) { ### Record applicable variables & time steps attr(B, "miss") = m.all attr(B, "rows") = idx dat$B[[i]] = B } if (!is.na(arg$B.series[1]) & arg$B.series[1] != 0) { ### Assemble the B's for (j in arg$B.series) { dat$B[[j]][idx, a] = matrix(B[, j], ncol = length(B[, j]), nrow = length(idx), byrow = TRUE) } } } ### Save X.err & C.res dat$X.err = X.err dat$C.res = C.res / arg$n.eff ### Compute average effective number of parameters, retained variance dat$p.eff.ave[iter] = sum(rowSums(dat$p.eff)) / arg$n.mis dat$v.ret.ave[iter] = sum(rowSums(dat$v.ret)) / arg$n.mis ### Unscale if (arg$covr == FALSE | arg$wts.flag == TRUE) { dat$X = .em.mScale(dat$X, 1 / D, type = 1, add = FALSE, center = FALSE, max.size = arg$max.size) Xhat = .em.mScale(Xhat, 1 / D, type = 1, add = FALSE, center = FALSE, max.size = arg$max.size) dat$C.res = .em.mScale((1 / D) * dat$C.res, 1 / D, type = 1, add = FALSE, center = FALSE, max.size = arg$max.size) dat$X.err = .em.mScale(dat$X.err, 1 / D, type = 1, add = FALSE, center = FALSE, max.size = arg$max.size) } ### Compute solution rms change in Xhat dat$dXhat[iter] = sqrt(.em.ssq(Xhat[arg$missing] - dat$X[arg$missing]) / arg$n.mis) ### Compute change in mean of missing values temp.X = dat$X + dat$M dat$nXhat[iter] = sqrt(.em.ssq(temp.X[arg$missing]) / arg$n.mis) ### Compute stagnation value dat$rXhat[iter] = dat$dXhat[iter] / dat$nXhat[iter] ### Save X & update X & Xhat dat$X[arg$missing] = Xhat[arg$missing] dat$Xhat = Xhat ### Recenter center = .em.center(dat$X, 1) dat$X = center[[1]] dat$Xhat = dat$Xhat - center[[2]] colnames(dat$X) = colnames(dat$Xhat) = char$c.names rownames(dat$X) = rownames(dat$Xhat) = char$r.names ### Generate C dat$C = (t(dat$X) %*% dat$X) / arg$n.eff + dat$C.res ### Iteration estimated relative imputation error dat$rel.err[iter] = sqrt(sum(X.err[arg$missing] ^ 2) / arg$n.mis) ### Iteration estimated absolute imputation error dat$abs.err[iter] = sqrt(sum(dat$X.err[arg$missing] ^ 2) / arg$n.mis) ### Holdout error if (arg$holdout == TRUE) { err = .em.holdout(dat$X, arg$covr, arg$wts, char, arg$DoFL, arg$missing, arg$cv.map, arg$n.err, arg$robust) dat$cv.err[iter] = err[[1]] dat$cv.abs.err[iter] = err[[2]] } ### Cleanup & return dat } ############################### ### ### TSVD REGRESSION FUNCTION ### ############################### .em.tsvdRegFn = function (arg, dat, char, n.eof, iter) { ### Setup C if first iteration dat$C = dat$C.res = NA ### Setup Xhat Xhat = matrix(0, nrow = arg$n, ncol = arg$p) ### Find scales and weighting factors if (arg$covr == TRUE) { D = arg$wts / rep(1, arg$p) } else { D = sqrt(colSums(dat$X ^ 2) / arg$n.eff) D = arg$wts / D } ### Scale if (arg$covr == FALSE | arg$wts.flag == TRUE) { dat$X = .em.mScale(dat$X, D, type = 1, add = FALSE, center = FALSE, max.size = arg$max.size) dat$Xhat = .em.mScale(dat$Xhat, D, type = 1, add = FALSE, center = FALSE, max.size = arg$max.size) } ### Approximate X svd.X = svd(dat$X) dat$d = (svd.X$d ^ 2) / arg$n ### No need to convert to matrix if(n.eof > 1) { Xhat = svd.X$u[, 1:n.eof] %*% diag(svd.X$d[1:n.eof]) %*% t(svd.X$v[, 1:n.eof]) } ### Need to convert to matrix if(n.eof == 1) { Xhat = as.matrix(svd.X$u[, 1]) %*% as.matrix(svd.X$d[1]) %*% t(svd.X$v[, 1]) } ### Estimated error dat$rel.err[iter] = sqrt(sum((Xhat - dat$X) ^ 2) / (arg$n.eff * arg$p)) ### Unscale if (arg$covr == FALSE | arg$wts.flag == TRUE) { dat$X = .em.mScale(dat$X, 1 / D, type = 1, add = FALSE, center = FALSE, max.size = arg$max.size) Xhat = .em.mScale(Xhat, 1 / D, type = 1, add = FALSE, center = FALSE, max.size = arg$max.size) } ### Compute solution rms change in Xhat dat$dXhat[iter] = sqrt(.em.ssq(Xhat[arg$missing] - dat$X[arg$missing]) / arg$n.mis) ### Compute change in mean of missing values temp.X = dat$X + dat$M dat$nXhat[iter] = sqrt(.em.ssq(temp.X[arg$missing]) / arg$n.mis) ### Compute stagnation value dat$rXhat[iter] = dat$dXhat[iter] / dat$nXhat[iter] ### Update X & Xhat dat$X[arg$missing] = Xhat[arg$missing] dat$Xhat = Xhat ### Recenter center = .em.center(dat$X, arg$DoFL) dat$X = center[[1]] dat$Xhat = dat$Xhat - center[[2]] colnames(dat$X) = colnames(dat$Xhat) = char$c.names rownames(dat$X) = rownames(dat$Xhat) = char$r.names ### Holdout error if (arg$holdout == TRUE) { err = .em.holdout(dat$X, char, arg$DoFL, arg$missing, arg$cv.map, arg$n.err, arg$robust) dat$cv.err[iter] = err[[1]] dat$cv.abs.err[iter] = err[[2]] } ### Cleanup & return dat } ########################################################################################################### ### ### ### SECTION 5: EXPECTATION-MAXIMIZATION MODULE ### ### ### ########################################################################################################### emFn = function (X, X0 = NA, C = NA, ccov = TRUE, wts = NA, type = "mridge", covr = FALSE, maxiter = 100, tol = 0.001, DoFL = 12, n.eof = NA, robust = FALSE, h.default = NA, gcv = "golub", min.relvar = 0.05, min.var = 0, inflate = 1, progressive = c(FALSE, 1, FALSE, 1), gap = 10, mem = FALSE, B.series = NA, plots = TRUE, plot.fns = NA, plot.series = 1, user.stats = NA, user.args = NA, save.file = c(FALSE, FALSE), file.name = NA, max.size = 1000) { ############################### ### Section 1: Initialization ############################### ### Data size & missing values n = nrow(X) p = ncol(X) n.eff = max(1, n - DoFL) missing = is.na(X) ##################################### ### TS TO MATRIX (FASTER PROCESSING) ##################################### if (is.ts(X)) { ### Record characteristics x.ts = TRUE st = start(X) en = end(X) fr = frequency(X) r.names = rownames(X) c.names = colnames(X) ### Convert to non-ts (allows faster computations) X = matrix(X, nrow = nrow(X), ncol = ncol(X)) } else { x.ts = FALSE st = en = fr = NA r.names = rownames(X) c.names = colnames(X) } #################### ### CHECK ARGUMENTS #################### ### Make sure at least 1 iteration allowed if (maxiter < 1) { warning("Supplied iteration limit too low (allowed range >= 1).\nSetting to 1.\n") maxiter = 1 } ### If TTLS or TSVD, ensure truncation parameter selected if (is.na(n.eof) & type != "mridge" & type != "iridge") { warning("Unspecified maximum EOF limit for TTLS or TSVD.\n Resetting maximum EOFs to full rank.\n") n.eof = min(n - DoFL - 1, p - 1) } ### Sort supplied B series in ascending order B.series = sort(unique(B.series)) ### If not numeric, turn B series off if (is.numeric(B.series) == FALSE) {B.series = NA} ### If impossible column #'s entered, turn B series off if (!identical(B.series, NA) & (min(B.series) < 0 | max(B.series) > p | length(B.series) > p)) { warning("Invalid variables for storing B matrices.\nStorage of B matrices will not be performed.\n") B.series = NA } ################# ### PREP WEIGHTS ################# ### No weights? if (identical(wts, NA)) { wts = rep(1, p) wts.flag = FALSE } else { wts.flag = TRUE } ### Check for correct length of weights if (length(wts) != p) {stop("Length of weighting vector does not equal number of variables")} ########### ### PREP X ########### ### Check for sufficient # of points, center, & estimate variance n.orig = .em.dofChk(missing, DoFL, p) center = .em.center(X, DoFL, !missing, robust) scales = sqrt(colSums(X ^ 2, na.rm = TRUE) / (n.orig - DoFL)) ### Update X = center[[1]] center = center[[2]] ########### ### Prep C ########### ### Construct covariance matrix prep.C = .em.prepC(X, C, scales, missing, ccov, n, p, DoFL) X = prep.C[[1]] C = prep.C[[2]] ################ ### Prep X0, C0 ################ ### Supplied X0? X0.on = FALSE missing0 = n.mis0 = NA if (!identical(X0, NA)) { ### Missing values? missing0 = is.na(X0) n.mis0 = sum(missing0) n.orig0 = colSums(!missing0) ### Center on all values X0 = .em.center(X0, DoFL, !missing0, robust)[[1]] ### Get actual variance X0.scales = sqrt(colSums(X0 ^ 2, na.rm = TRUE) / (n.orig0 - DoFL)) ### Center on calibration period X0 = .em.center(X0, DoFL, !missing, robust)[[1]] ### Turn on flag X0.on = TRUE } else { ### No supplied X0, so use scale of X X0.scales = scales } ################# ### PREP OPTIONS ################# ### Valid option entered? if (type != "tsvd" & type != "ttls" & type != "iridge" & type != "mridge") {stop(paste("Unknown regression type ", type, ".", sep = ""))} ### GCV option opt = TRUE if (gcv == "mallows") {opt = FALSE} ### Ridge options if (type == "iridge" | type == "mridge") { regFn = .em.ridgeRegFn conv.message = "ridge" ridge.trunc = n.eof if (is.na(ridge.trunc)) {ridge.trunc = max(1, min(n - DoFL, p))} n.eof = 1 plot.type = 2 } ### TTLS options if (type == "ttls") { regFn = .em.ttlsRegFn conv.message = "eof" ridge.trunc = min(n.eof) plot.type = 1 } ### TSVD options if (type == "tsvd") { regFn = .em.tsvdRegFn B.series = NA conv.message = "eof" ridge.trunc = min(n.eof) plot.type = 1 } ### Used if progressively iterating (applicable to TTLS and TSVD only) start.eof = end.eof = store.eof = n.eof if (progressive[1] == TRUE & (type == "ttls" | type == "tsvd")) { ### Get the starting and ending EOFs for progressive start.eof = progressive[2] end.eof = n.eof ### Check for valid entries if (start.eof > n.eof) { warning("Invalid starting EOF for progressive (greater than ending EOF).\nProgressive not used.\n") start.eof = end.eof = store.eof = n.eof } ### Store the results? if (progressive[3] == TRUE) {store.eof = progressive[4]} ### Storage EOFs valid? if (store.eof > end.eof) { warning("Invalid starting EOF for retaining infilled estimates.\nOnly storing final estimate.\n") store.eof = n.eof } } ### Save options - if file name not provided, create one if ((save.file[1] == TRUE | save.file[2] == TRUE) & is.na(file.name)) { ### Make sure auto-named file doesn't already exist a = 1 while (file.exists(paste("EM-", type, "-", Sys.Date(), "-", a, ".RData", sep = ""))) {a = a + 1} ### Create name file.name = paste("EM-", type, "-", Sys.Date(), "-", a, ".RData", sep = "") rm(a) } ### Residual covariance inflation options if (identical(inflate, NA)) {inflate = c(1, 1)} inflate[is.na(inflate)] = 1 ########################################### ### FIND UNIQUE AVAILABLE/MISSING PATTERNS ########################################### ### Search for unique combinations if not TSVD if (type == "tsvd") { temp.unique = list() temp.unique$common = temp.unique$avail = temp.unique$miss = NA } else { temp.unique = .em.unique(X, missing) } ####################################### ### PREP MISSING/ACTUAL VARIABLE LISTS ####################################### ### Set up missing / actual value lists temp.unique$miss = .em.subProblems(temp.unique, type) ################## ### PREP PLOTTING ################## ### User-defined or default functions? if (plots == TRUE & identical(plot.fns, NA)) { ### Grab default functions plot.fns = list() plot.fns[[1]] = .em.plotSeries plot.fns[[2]] = .em.plotSV plot.fns[[3]] = .em.plotVar plot.fns[[4]] = .em.plotEstErr ### Holdout error? if (X0.on == TRUE) {plot.fns[[4]] = .em.plotActErr} ### Ridge style? if (plot.type == 2) {plot.fns[[2]] = .em.plotParams} ### TSVD? if (type == "tsvd") {plot.fns[[3]] = .em.plotVarTsvd} } ############################# ### Establish argument lists ############################# ### Option/info list .em.arg = list() .em.arg$n = n .em.arg$p = p .em.arg$wts = wts .em.arg$wts.flag = wts.flag .em.arg$type = type .em.arg$trunc = ridge.trunc .em.arg$covr = covr .em.arg$maxiter = maxiter .em.arg$tol = tol .em.arg$DoFL = DoFL .em.arg$n.eff = n.eff .em.arg$n.eof = n.eof .em.arg$robust = robust .em.arg$h.default = h.default .em.arg$gcv = gcv .em.arg$opt = opt .em.arg$min.relvar = min.relvar .em.arg$min.var = min.var .em.arg$inflate = inflate .em.arg$B.series = B.series .em.arg$max.size = max.size .em.arg$missing = missing .em.arg$common = temp.unique$common .em.arg$miss = temp.unique$miss .em.arg$avail = temp.unique$avail .em.arg$n.mis = sum(missing) .em.arg$holdout = X0.on .em.arg$cv.map = !missing0 & missing .em.arg$n.err = sum(missing) - n.mis0 .em.arg$start.eof = start.eof .em.arg$store.eof = store.eof .em.arg$end.eof = end.eof ### Data list .em.results = list() .em.dat = list() .em.dat$X = X .em.dat$Xhat = X .em.dat$C = C .em.dat$C.res = matrix(0, nrow = p, ncol = p) .em.dat$B = list() .em.dat$dXhat = 1e6 .em.dat$nXhat = 1e6 .em.dat$rXhat = 1e6 .em.dat$p.eff.ave = vector() .em.dat$v.ret.ave = vector() .em.dat$cv.err = vector() .em.dat$cv.abs.err = vector() .em.dat$rel.err = vector() .em.dat$abs.err = vector() .em.dat$d = vector() .em.dat$M = center ### Characteristics list .em.char = list() .em.char$st = st .em.char$en = en .em.char$fr = fr .em.char$ts = x.ts .em.char$r.names = r.names .em.char$c.names = c.names .em.char$scales = scales .em.char$X0 = X0 .em.char$X0.scales = X0.scales ### Clean up rm(X, X0, C, ccov, wts, wts.flag, type, ridge.trunc, covr, maxiter, DoFL, n.eff, n.eof, robust, h.default, gcv, opt, min.relvar, min.var, inflate, B.series, max.size, missing, temp.unique, X0.on, missing0, n.mis0, center, st, en, fr, x.ts, r.names, c.names, scales, X0.scales, n.orig, prep.C, n, p) temp = gc() ### Print estimated memory usage? if (mem == TRUE) { ### Get object sizes size.dat = round(object.size(.em.dat) / 1048576, 5) size.arg = round(object.size(.em.arg) / 1048576, 5) size.char = round(object.size(.em.char) / 1048576, 5) used.mem = size.dat + size.arg + size.char ### Output memory usage message(paste("Initial Memory usage: ", used.mem, "Mb")) message("-----------------------------------------------") message("") } ########################### ### Section 2: Regression ########################### for (i in start.eof:end.eof) { ### Initialize .em.results[[i]] = list() iter = 1 stag = 1e6 stag.pre = 1e6 ### Prep statistics? if (!identical(user.stats, NA)) { ### Call first user stats prep function (user.stats must be a list with a prep function, condition-met function, and calc function) prepStats = user.stats[[1]] prep = prepStats(.em.arg, .em.dat, .em.char, i, iter, user.args) ### Assign stats storage to .em.results .em.dat$stats = prep[[1]] ### Pass any other user-defined info from prepStats() to emFn() stats.info = prep[[2]] } ### Iterate while (iter <= .em.arg$maxiter & stag > tol) { ### Regression .em.results[[i]] = regFn(.em.arg, .em.dat, .em.char, i, iter) ### Find changes in estimated values rel = round(.em.results[[i]]$nXhat[iter], 8) stag = round(.em.results[[i]]$rXhat[iter], 8) rms = round(.em.results[[i]]$dXhat[iter], 8) ### Plotting? if (plots == TRUE) { ### How many plots to create? num.plots = length(plot.fns) ### Set up plotting area par(mar = c(0.5, 4, 2, .15) + 0.1) close.screen(all = TRUE) split.screen(c(num.plots, 1)) ### Cycle through each plot for (np in 1:num.plots) { ### Open screen screen(np) ### Get plotting function and parameters plotFn = plot.fns[[np]] ### Plot plotFn(.em.arg, .em.results[[i]], .em.char, i, iter, plot.series[np]) } ### Restore plot area defaults close.screen(all = TRUE) par(mar = c(5, 4, 4, 2) + 0.1) } ### Print iteration/memory usage or save iteration? if (iter / gap == trunc(iter / gap)) { ### Iteration info message(paste(iter, ": Relative: ", rel, "; RMS: ", rms, "; Stagnation: ", stag, ".", sep = "")) ### Memory info if (mem == TRUE) { ### Get object sizes size.dat = round(object.size(.em.dat) / 1048576, 5) size.arg = round(object.size(.em.arg) / 1048576, 5) size.char = round(object.size(.em.char) / 1048576, 5) size.em = round(2 * object.size(.em.results) / 1048576, 5) used.mem = size.dat + size.arg + size.char + size.em ### Output memory usage message(paste(" - Current Memory usage: ", used.mem, "Mb -")) } ### Save at each "gap" iteration? if (save.file[2] == TRUE) { ### Setup new list em = list() em[[iter]] = .em.results[[i]] em[[iter]]$arg = list() em[[iter]]$arg = .em.arg em[[iter]]$char = list() em[[iter]]$char = .em.char ### Recenter to original centering center = .em.center(em[[iter]]$X, em[[iter]]$arg$DoFL, idx = !em[[iter]]$arg$missing, em[[iter]]$arg$robust) em[[iter]]$X = center[[1]] + em[[iter]]$M em[[iter]]$Xhat = em[[iter]]$Xhat - center[[2]] + em[[iter]]$M em[[iter]]$char$X0 = .em.center(em[[iter]]$char$X0, em[[iter]]$arg$DoFL, idx = !em[[iter]]$arg$missing, em[[iter]]$arg$robust)[[1]] + em[[iter]]$M ### Restore row/column names rownames(em[[iter]]$X) = rownames(em[[iter]]$Xhat) = em[[iter]]$char$r.names colnames(em[[iter]]$X) = colnames(em[[iter]]$Xhat) = em[[iter]]$char$c.names ### Restore ts if (em[[iter]]$char$ts == TRUE) { em[[iter]]$X = ts(em[[iter]]$X, start = em[[iter]]$char$st, freq = em[[iter]]$char$fr) em[[iter]]$Xhat = ts(em[[iter]]$Xhat, start = em[[iter]]$char$st, freq = em[[iter]]$char$fr) } ### Restore ts to B's if (em[[iter]]$char$ts == TRUE & !is.na(em[[iter]]$arg$B.series[1]) & em[[iter]]$arg$B.series[1] != 0) { ### Cycle through each retained B for (j in em[[iter]]$arg$B.series) { em[[iter]]$B[[j]] = ts(em[[iter]]$B[[j]], start = em[[iter]]$char$st, freq = em[[iter]]$char$fr) rownames(em[[iter]]$B[[j]] ) = em[[iter]]$char$r.names colnames(em[[iter]]$B[[j]] ) = em[[iter]]$char$c.names } } ### Save save.name = paste("EOF-", i, "-iter-", iter, "-", file.name, sep = "") save(em, file = save.name) rm(em) } } ### Get statistics? if (!identical(user.stats, NA)) { ### Check to see if conditions are met for calculating stats condMet = user.stats[[2]] cond.met = condMet(.em.arg, .em.results[[i]], .em.char, i, iter, stats.info, user.args) ### If cond.met is TRUE, call the stats function if (cond.met[[1]] == TRUE) { ### Get the stats function & call calcStats = user.stats[[3]] stats = calcStats(.em.arg, .em.results[[i]], .em.char, i, iter, cond.met, stats.info, user.args) ### Update stats and other user-defined info .em.results[[i]]$stats = stats[[1]] stats.info = stats[[2]] } } ### Update .em.dat = .em.results[[i]] ### Increment iter = iter + 1 } ### Define ridge messages if (conv.message == "ridge") { conv.text = (paste(iter - 1, " iterations to convergence with ", round(.em.dat$p.eff.ave[iter - 1], 3), " average number of parameters.", sep = "")) noconv.text = (paste(iter - 1, " iterations without convergence with ", round(.em.dat$p.eff.ave[iter - 1], 3), " average number of parameters.", sep = "")) params.text = (paste("Maximum possible parameters: ", .em.arg$trunc, ".\n", sep = "")) } ### Define truncation messages if (conv.message == "eof") { conv.text = (paste(iter - 1, " iterations to convergence at ", i, " retained EOFs.", sep = "")) noconv.text = (paste(iter - 1, " iterations without convergence at ", i, " retained EOFs.", sep = "")) params.text = (paste("Maximum possible parameters: ", .em.arg$trunc, ".\n", sep = "")) } ### Save convergence information & inform message("\nFINAL ITERATIONS:") if (stag <= .em.arg$tol) { ### Reached convergence tolerance message(conv.text) message(params.text) .em.results[[i]]$conv = TRUE .em.results[[i]]$conv.iter = iter } else { ### Did not reach convergence tolerance message(noconv.text) message(params.text) .em.results[[i]]$conv = FALSE .em.results[[i]]$conv.iter = -(iter) } ### Print change in mean, RMS, and final stagnation value message("FINAL DELTAS:") message(paste("Relative change: ", rel, "\nRMS change: ", rms, "\nStagnation value: ", stag, "\n", sep = "")) ### Print holdout error if (.em.arg$holdout == TRUE) { message("FINAL MEASURED HOLDOUT ERRORS:") message(paste("Relative RMS holdout error:", round(.em.dat$cv.err[iter - 1], 4))) message(paste( "Absolute RMS holdout error:", round(.em.dat$cv.abs.err[iter - 1], 4), "\n")) } ### Print estimated error message("FINAL ESTIMATED ERRORS:") message(paste("Relative RMS estimated error:", round(.em.dat$rel.err[iter - 1], 4))) message(paste( "Absolute RMS estimated error:", round(.em.dat$abs.err[iter - 1], 4), "\n")) ### Restore original data characeteristics if (store.eof <= i | (save.file[1] == TRUE)) { ### Recenter to original centering center = .em.center(.em.results[[i]]$X, .em.arg$DoFL, idx = !.em.arg$missing, .em.arg$robust) .em.results[[i]]$X = center[[1]] + .em.dat$M .em.results[[i]]$Xhat = .em.results[[i]]$Xhat - center[[2]] + .em.dat$M if (.em.arg$holdout == TRUE) {.em.char$X0 = .em.center(.em.char$X0, .em.arg$DoFL, idx = !.em.arg$missing, .em.arg$robust)[[1]] + .em.dat$M} ### Restore row/column names rownames(.em.results[[i]]$X) = rownames(.em.results[[i]]$Xhat) = .em.char$r.names colnames(.em.results[[i]]$X) = colnames(.em.results[[i]]$Xhat) = .em.char$c.names ### Restore ts if (.em.char$ts == TRUE) { .em.results[[i]]$X = ts(.em.results[[i]]$X, start = .em.char$st, freq = .em.char$fr) .em.results[[i]]$Xhat = ts(.em.results[[i]]$Xhat, start = .em.char$st, freq = .em.char$fr) } ### Restore ts for B's if (.em.char$ts == TRUE & !is.na(.em.arg$B.series[1]) & .em.arg$B.series[1] != 0) { ### Cycle through each B for (j in .em.arg$B.series) { .em.results[[i]]$B[[j]] = ts(.em.results[[i]]$B[[j]], start = .em.char$st, freq = .em.char$fr) rownames(.em.results[[i]]$B[[j]] ) = .em.char$r.names colnames(.em.results[[i]]$B[[j]] ) = .em.char$c.names } } ### Save converged solution? if (save.file[1] == TRUE) { save.name = paste("EOF-", i, "-", file.name, sep = "") converged = list() converged[[i]] = .em.results[[i]] converged[[i]]$arg = list() converged[[i]]$arg = .em.arg converged[[i]]$char = list() converged[[i]]$char = .em.char save(converged, file = save.name) rm(converged) gc() } } ### Setup for next EOF (progressive) .em.dat$dXhat = .em.results[[i]]$dXhat[iter - 1] .em.dat$nXhat = .em.results[[i]]$nXhat[iter - 1] .em.dat$rXhat = .em.results[[i]]$rXhat[iter - 1] .em.dat$cv.err = vector() .em.dat$rel.err = vector() .em.dat$abs.err = vector() .em.dat$rho = list() if (store.eof > i) {.em.results[[i]] = list()} } ### Return .em.results[[i]]$arg = list() .em.results[[i]]$arg = .em.arg .em.results[[i]]$char = list() .em.results[[i]]$char = .em.char .em.results }