# --------------------------------------- # Author: Andreas Alfons # Vienna University of Technology # --------------------------------------- ## context sensitive algorithm for robust model selection csselect <- function(X, y, k, k1, alpha=0.3, crit="bic", args.brlars=list(), args.lmrob=list(), args.regsubsets=list()) { # INPUT # X = matrix with candidate predictors # y = vector with corresponding response values # k = maximum number of variables for the final model # k1 = number of variables to be sequenced in the initial B-RLARS step # alpha = significance level for filtering variables in MM regression # crit = criterion to be used in the weighted k-subset regression (possible # values are "adjr2", "rsq", "rss","cp" and "bic") # args.brlars = list of additional arguments to be passed to function 'brlars' # args.brlars can contain the following elements: # B = number of bootstrap samples # dB = optional parameter that sets the number of candidate predictors for the # bootstrap samples. If this number is lower than ncol(X), then RLARS is # first ran on the original data with all candidate predictors and # only the predictors in the top sequence of length dB are used in the # bootstrap procedure. # mm = number of predictors sequenced for each bootstrap sample # seed = value of the random seed # args.lmrob = list of additional arguments to be passed to function 'lmrob' # (see help) # args.regsubsets = list of additional arguments to be passed to function # 'regsubsets' (see help) # required libraries require(robustbase) require(leaps) # error messages if(!(crit %in% c("adjr2", "rsq", "rss","cp", "bic"))) stop(paste("criterium ", crit, " is not supported.", sep="'")) ## robustly scale variables tmp <- standardize.rob(X, y) # function defined in B-RLARS.R X <- as.data.frame(tmp$X) y <- tmp$y # step 1: B-RLARS if(missing(k1)) k1 <- min(20, sqrt(ncol(X))) args.brlars$X <- X args.brlars$y <- y args.brlars$m <- k1 if(is.null(args.brlars$dB) && nrow(X) < ncol(X)) args.brlars$dB <- nrow(X) if(is.null(args.brlars$mm)) args.brlars$mm <- k1 seq.brlars <- do.call(brlars, args.brlars)$active # B-RLARS sequence X.brlars <- X[, seq.brlars] names.brlars <- names(X.brlars) # step 2: MM args.lmrob$formula <- y ~ . #- 1 args.lmrob$data <- cbind(y, X.brlars) res.mm <- do.call(lmrob, args.lmrob) pr <- summary(res.mm)$coefficients[names.brlars, 4] # probabilities significant <- which(pr <= alpha) # significant predictors X.mm <- X.brlars[,significant] names.mm <- names(X.mm) seq.mm <- seq.brlars[names.brlars %in% names.mm] args.lmrob$data <- cbind(y, X.mm) # reduced data set res.mm <- do.call(lmrob, args.lmrob) weights.mm <- res.mm$weights # step 3: k-subset if(missing(k)) k <- min(10, sqrt(ncol(X))) args.regsubsets$x <- X.mm * weights.mm args.regsubsets$y <- y * weights.mm args.regsubsets$nvmax <- k res.ksub <- summary(do.call(regsubsets, args.regsubsets)) best <- if(crit %in% c("adjr2", "rsq")) which.max(res.ksub[[crit]]) else which.min(res.ksub[[crit]]) names.ksub <- names.mm[res.ksub$which[best, names.mm]] seq.ksub <- seq.brlars[names.brlars %in% names.ksub] # result cs <- list(indices=seq.ksub, names=names.ksub, brlars=seq.brlars, significant=seq.mm) class(cs) <- "csselect" return(cs) } ## print method print.csselect <- function(x, ...) { print(unclass(x)[1:2], ...) } ## functions required for dendrogram # robust correlations cor.rob <- function(x, alpha=0.75) { # INPUT # x = vector, matrix or data.frame # alpha = numeric control parameter for function 'covMcd' (see help) require(robustbase) covMcd(x, cor=T, alpha=alpha)$cor } # distances based on (robust) correlations dist.cor <- function(x, robust=TRUE, alpha=0.75) { # INPUT # x = matrix or data.frame # robust = a logical indicating whether robust correlation should be used # alpha = numeric control parameter for function 'covMcd' (see help) as.dist(1 - abs(if(robust) cor.rob(x, alpha) else cor(x))) } ## dendrogram based on (robust) correlations csdendro <- function(x, data, method="complete", robust=TRUE, alpha=0.75, main=NULL, sub=NULL, xlab=NULL, ylab=NULL, ...) { # INPUT # x = if 'data' is supplied, an object of class "cssselect" or a vector # specifying the columns of 'data'; otherwise, a matrix or data.frame # data = matrix or data.frame # method = agglomeration method for hierarchical clustering (see help for # 'hclust') # robust = a logical indicating whether robust correlation should be used # alpha = numeric control parameter for function 'covMcd' (see help) # main, sub, xlab, ylab = plot annotation # further arguments passed to 'plot' (see help for 'plot.hclust') if(!missing(data)) { x <- if(inherits(x, "csselect")) data[,x$brlars] else data[,x] } d <- dist.cor(x, robust, alpha) cl <- hclust(d, method=method) if(is.null(main)) main <- "" if(is.null(sub)) sub <- "" if(is.null(xlab)) xlab <- "Candidate predictors" if(is.null(ylab)) ylab <- "Dissimilarity" plot(cl, main=main, sub=sub, xlab=xlab, ylab=ylab, ...) invisible(cl) }