# ---------------------------------------
# Author: Andreas Alfons
#         Vienna University of Technology
# ---------------------------------------

## context sensitive algorithm for robust model selection 
csselect <- function(X, y, k, k1, alpha=0.3, crit="bic", 
        args.brlars=list(), args.lmrob=list(), args.regsubsets=list()) {
# INPUT
# X = matrix with candidate predictors
# y = vector with corresponding response values
# k = maximum number of variables for the final model
# k1 = number of variables to be sequenced in the initial B-RLARS step
# alpha = significance level for filtering variables in MM regression
# crit = criterion to be used in the weighted k-subset regression (possible 
#        values are "adjr2", "rsq", "rss","cp" and "bic")
# args.brlars = list of additional arguments to be passed to function 'brlars'
# args.brlars can contain the following elements:
#   B = number of bootstrap samples
#   dB = optional parameter that sets the number of candidate predictors for the
#        bootstrap samples. If this number is lower than ncol(X), then RLARS is 
#        first ran on the original data with all candidate predictors and 
#        only the predictors in the top sequence of length dB are used in the 
#        bootstrap procedure.
#   mm = number of predictors sequenced for each bootstrap sample
#   seed = value of the random seed
# args.lmrob = list of additional arguments to be passed to function 'lmrob' 
#              (see help)
# args.regsubsets = list of additional arguments to be passed to function 
#                   'regsubsets' (see help)
    
    # required libraries
    require(robustbase)
    require(leaps)
    # error messages
    if(!(crit %in% c("adjr2", "rsq", "rss","cp", "bic"))) 
        stop(paste("criterium ", crit, " is not supported.", sep="'"))
    
    ## robustly scale variables
    tmp <- standardize.rob(X, y)  # function defined in B-RLARS.R
    X <- as.data.frame(tmp$X)
    y <- tmp$y
    
    # step 1: B-RLARS
    if(missing(k1)) k1 <- min(20, sqrt(ncol(X)))
    args.brlars$X <- X
    args.brlars$y <- y
    args.brlars$m <- k1
    if(is.null(args.brlars$dB) && nrow(X) < ncol(X)) args.brlars$dB <- nrow(X)
    if(is.null(args.brlars$mm)) args.brlars$mm <- k1
    seq.brlars <- do.call(brlars, args.brlars)$active  # B-RLARS sequence
    X.brlars <- X[, seq.brlars]
    names.brlars <- names(X.brlars)
    
    # step 2: MM
    args.lmrob$formula <- y ~ . #- 1
    args.lmrob$data <- cbind(y, X.brlars)
    res.mm <- do.call(lmrob, args.lmrob)
    pr <- summary(res.mm)$coefficients[names.brlars, 4]  # probabilities
    significant <- which(pr <= alpha)  # significant predictors
    X.mm <- X.brlars[,significant]
    names.mm <- names(X.mm)
    seq.mm <- seq.brlars[names.brlars %in% names.mm]
    args.lmrob$data <- cbind(y, X.mm)  # reduced data set
    res.mm <- do.call(lmrob, args.lmrob)
    weights.mm <- res.mm$weights

    # step 3: k-subset
    if(missing(k)) k <- min(10, sqrt(ncol(X)))
    args.regsubsets$x <- X.mm * weights.mm
    args.regsubsets$y <- y * weights.mm
    args.regsubsets$nvmax <- k
    res.ksub <- summary(do.call(regsubsets, args.regsubsets))
    best <- if(crit %in% c("adjr2", "rsq")) which.max(res.ksub[[crit]])
        else which.min(res.ksub[[crit]])
    names.ksub <- names.mm[res.ksub$which[best, names.mm]]
    seq.ksub <- seq.brlars[names.brlars %in% names.ksub]
    
    # result
    cs <- list(indices=seq.ksub, names=names.ksub, 
        brlars=seq.brlars, significant=seq.mm)
    class(cs) <- "csselect"
    return(cs)
}

## print method
print.csselect <- function(x, ...) {
    print(unclass(x)[1:2], ...)
}

## functions required for dendrogram
# robust correlations
cor.rob <- function(x, alpha=0.75) {
# INPUT
# x = vector, matrix or data.frame
# alpha = numeric control parameter for function 'covMcd' (see help)
    require(robustbase)
    covMcd(x, cor=T, alpha=alpha)$cor
}
# distances based on (robust) correlations
dist.cor <- function(x, robust=TRUE, alpha=0.75) {
# INPUT
# x = matrix or data.frame
# robust = a logical indicating whether robust correlation should be used
# alpha = numeric control parameter for function 'covMcd' (see help)
    as.dist(1 - abs(if(robust) cor.rob(x, alpha) else cor(x)))
}

## dendrogram based on (robust) correlations
csdendro <- function(x, data, method="complete", robust=TRUE, alpha=0.75, 
        main=NULL, sub=NULL, xlab=NULL, ylab=NULL, ...) {
# INPUT
# x = if 'data' is supplied, an object of class "cssselect" or a vector 
#     specifying the columns of 'data'; otherwise, a matrix or data.frame
# data = matrix or data.frame
# method = agglomeration method for hierarchical clustering (see help for 
#          'hclust')
# robust = a logical indicating whether robust correlation should be used
# alpha = numeric control parameter for function 'covMcd' (see help)
# main, sub, xlab, ylab = plot annotation
# further arguments passed to 'plot' (see help for 'plot.hclust')
        if(!missing(data)) {
        x <- if(inherits(x, "csselect")) data[,x$brlars] else data[,x]
    }
    d <- dist.cor(x, robust, alpha)
    cl <- hclust(d, method=method)
    if(is.null(main)) main <- ""
    if(is.null(sub)) sub <- ""
    if(is.null(xlab)) xlab <- "Candidate predictors"
    if(is.null(ylab)) ylab <- "Dissimilarity"
    plot(cl, main=main, sub=sub, xlab=xlab, ylab=ylab, ...)
    invisible(cl)
}
