brlars<-function(X, y, m=ncol(X),B=50,mm=ncol(X),dB=ncol(X),seed=12) { # INPUT # X = matrix with candidate predictors # y = vector with corresponding response values # m = number of predictors that will be sequenced # B = number of bootstrap samples # mm = number of predictors sequenced for each bootstrap sample # dB = optional parameter that sets the number of candidate predictors for the # bootstrap samples. If this number is lower than ncol(X), then RLARS is # first ran on the original data with all candidate predictors and # only the predictors in the top sequence of length dB are used in the # bootstrap procedure. # seed = value of the random seed # Check that a valid number of m has been provided if (m>ncol(X)){stop(paste('The number of predictors to sequence, m=',m,' ,exceeds the number of candidate predictors which is',ncol(X)))} n=length(y) # Check for high dimensional data and adjust m and mm when necessary if((m>=n) || (mm>=n)) {warning(paste('At most',n-1,'predictors can be sequenced')) m=n-1 if(mm>=n){mm=round(n/2)} } # Adjust dB for high dimensional data if necessary if((dB!=ncol(X)) && (dB>=n)){dB=n-1} # Basic RLARS procedure without bootstrap if(B==0){out=rlars(X,y,m)$active} # B-RLARS procedure else { set.seed(seed) sequence=1:dB if(dB != ncol(X)) {sequence=rlars(X,y,dB)$active} count=matrix(0,B,dB) position<-matrix(0,B,dB) for (k in 1:B) { select<-sample(1:n,replace=T) sequ<-rlars(X[select,sequence],y[select],mm)$active count[k,sequ]<-count[k,sequ]+1 position[k,sequ]<-position[k,sequ] + 1:mm } totalcount=apply(count,2,sum) aveposition=apply(position,2,sum) aveposition[totalcount>0]<-aveposition[totalcount>0]/totalcount[totalcount>0] out = (1:dB)[order(totalcount,-aveposition,decreasing=T)][1:m] out=sequence[out] } list(active=out) } rlars<-function(X, y, m=ncol(X)) { temp <- standardize.rob(X, y) X <- temp$X y <- temp$y inactive <- c(seq(1, (k <- ncol(X)))) active <- c() signvec<-c() #up.x <- c() actmat<-diag(rep(1,k)) up.cor <- rep(0, k) for(j in 1:k) { up.cor[j] <- bi.hubcor(X[, j], y) } ycorvec<-up.cor up.maxcor <- max(abs(up.cor)) newsub <- which(abs(up.cor) == max(abs(up.cor))) newsign <- sign(up.cor[newsub]) active <- c(active, newsub) inactive <- inactive[inactive != newsub] signvec<-c(signvec,newsign) #up.x <- cbind(up.x, newsign * X[, newsub]) for(j in inactive) {actmat[newsub,j]<-actmat[j,newsub]<-bi.hubcor(X[,newsub],X[, j])} ones<-rep(1,length(active)) G <- diag(ones) inv.G <- solve(G) up.A <- as.double((ones %*% inv.G %*% ones)^(-0.5)) up.w <- up.A * (inv.G %*% ones) #up.u <- up.x %*% up.w up.a <- rep(0, k) for(j in inactive){ corvec <- actmat[j,active]*signvec up.a[j] <- sum(corvec * up.w)} up.gam <- 9999999 for(i in inactive) { #cat("i=",i,"\n") if((aa <- (up.maxcor - up.cor[i])/(up.A - up.a[i])) > 0 && aa < up.gam) { up.gam <- aa #cat("what ", up.gam, "\n") newsub <- i newsign <- 1 } if((bb <- (up.maxcor + up.cor[i])/(up.A + up.a[i])) > 0 && bb < up.gam) { up.gam <- bb #cat("what ", up.gam, "\n") newsub <- i newsign <- -1 } } up.gam <- as.double(up.gam) #cat("current gamma =", up.gam, "\n") #up.mean <- up.mean + up.gam*up.u #cat("current mean =", up.mean, "\n") #up.resid <- y - up.mean up.maxcor <- up.maxcor - up.gam * up.A up.cor <- up.cor - up.gam * up.a #cat("step", "\t", j, "\n") for(l in 1:(m - 2)) { # cat(l,"\n") active <- c(active, newsub) signvec<-c(signvec,newsign) inactive <- inactive[inactive != newsub] #up.x <- cbind(up.x, newsign * X[, newsub]) for(j in inactive){ actmat[newsub,j]<-actmat[j,newsub]<-bi.hubcor(X[,newsub],X[,j])} ones <- rep(1, (d = length(active))) G <- (signvec%*%t(signvec))*actmat[active,active] d1=dim(G)[1] check <- eigen(G,symmetric=T) lamb <- check$values if (min(lamb)<0) { evec <- check$vectors lamb <- (apply(X[,active]%*%evec,2,mad))^2 G <- matrix(0,ncol=d1,nrow=d1) for (o in 1:d1) {G <- G + lamb[o]*(evec[,o]%*%t(evec[,o]))} } inv.G <- solve(G) # inv.G=solve(G, tol = 1e-25) up.A <- as.double((ones %*% inv.G %*% ones)^(-0.5)) up.w <- up.A * (inv.G %*% ones) #up.u <- up.x %*% up.w up.a <- rep(0, k) for(j in inactive){ corvec <- actmat[j,active]*signvec up.a[j] <- sum(corvec * up.w)} up.gam <- 9999999 for(i in inactive) { #cat("i=",i,"\n") if((aa <- (up.maxcor - up.cor[i])/(up.A - up.a[i])) > 0 && aa < up.gam) { up.gam <- aa #cat("what ", up.gam, "\n") newsub <- i newsign <- 1 } if((bb <- (up.maxcor + up.cor[i])/(up.A + up.a[i])) > 0 && bb < up.gam) { up.gam <- bb #cat("what ", up.gam, "\n") newsub <- i newsign <- -1 } } up.gam <- as.double(up.gam) #cat("current gamma =", up.gam, "\n") #up.mean <- up.mean + up.gam*up.u #cat("current mean =", up.mean, "\n") #up.resid <- y - up.mean up.maxcor <- up.maxcor - up.gam * up.A up.cor <- up.cor - up.gam * up.a } active <- c(active, newsub) list(active=active) } standardize.rob<-function(X, y) { X <- apply(X,2,robstand) y <- robstand(y) list(X=X, y=y) } # Following function adjusts for 0-1 variable robstand<-function(x) { if ((m<-mad(x))==0){ (x-mean(x))/sd(x) } else {(x-median(x))/m } } bi.hubcor<-function(x, y, a = 2, p = 0.95) { robcov <- diag(c(1,1)) robcov[1,2] <- simp.hubercor(x,y, a) robcov[2,1] <- robcov[1,2] xy <- cbind(x, y) n <- nrow(xy) d <- qchisq(p, 2) invcov<-solve(robcov) D <- rowSums((xy %*% invcov) * xy) #D <- diag(xy %*% solve(robcov) %*% t(xy)) ind <- D>d xy[ind,] <- xy[ind,]*sqrt(d/D[ind]) x <- xy[, 1] y <- xy[, 2] cor(x,y) } simp.hubercor<-function(x, y, a=2) { indx<-abs(x)>a x[indx]<-a*sign(x[indx]) indy<-abs(y)>a y[indy]<-a*sign(y[indy]) n <- length(x) aa <- x*y ind <- aa < 0 n1 <- sum(ind) n2 <- n - n1 if(n1 <= n2) { a1 <- a*sqrt(n1/n2) indx<-abs(x[ind])>a1 x[ind][indx] <- a1*sign(x[ind][indx]) indy<-abs(y[ind])>a1 y[ind][indy] <- a1*sign(y[ind][indy]) } else { ind <- aa > 0 a1 <- a*sqrt(n2/n1) indx<-abs(x[ind])>a1 x[ind][indx] <- a1*sign(x[ind][indx]) indy<-abs(y[ind])>a1 y[ind][indy] <- a1*sign(y[ind][indy]) } cor(x, y) }