This package provides functions for generating all possible splits of variables into groups, and computing the best split selection regression estimator for low-dimensional data.
You can install the stable version from CRAN.
install.packages("splitSelect", dependencies = TRUE)
You can install the development version from GitHub.
# Load devtools, then install the development version of splitSelect from GitHub
library(devtools)
devtools::install_github("AnthonyChristidis/splitSelect")
Here is some code to generate all possible splits of variables into groups.
# Loading library
library(splitSelect)

# Setting number of variables and groups
p <- 8
G <- 4
# Passed through as the `use.all` argument of every call below
use.all <- TRUE

# Generate the partitions and print them
my.partitions <- generate_partitions(p, G, use.all=use.all)
my.partitions

# Generate the number of splits
nsplit(p, G, use.all=use.all)

# Generate the number of splits (fixed partition)
nsplit(p, G, use.all=use.all,
       fix.partition=matrix(c(2,2,2,2), nrow=1))

# Generate the splits
all.splits <- generate_splits(p, G, use.all=use.all)
head(all.splits)
nrow(all.splits)

# Generate the splits (fixed partition)
all.splits <- generate_splits(p, G, use.all=use.all,
                              fix.partition=matrix(c(2,2,2,2), nrow=1))
head(all.splits)
nrow(all.splits)

# Generate samples of splits
sample.splits <- rsplit(10000, p, G, fix.partition=matrix(c(2,2,2,2), nrow=1))
sample.splits
Here is some code to compute the best split selection estimator for simulated data with spurious correlation in the training set.
# Download the packages
install.packages("simTargetCov")
install.packages("glmnet")
install.packages("SplitReg")
# The example also calls mvnfast::rmvn() and doParallel::registerDoParallel(),
# so make sure those packages are available as well
install.packages("mvnfast")
install.packages("doParallel")
# Setting the parameters
p <- 6            # number of predictors
n <- 30           # training sample size
n.test <- 5000    # test sample size
group.beta <- 5   # coefficient shared by the last p-2 predictors
beta <- c(rep(1, 2), rep(group.beta, p-2))
rho <- 0.1        # correlation parameter used for the test-set covariance (Sigma.rho)
r <- 0.9          # correlation parameter used as the training-set target (Sigma.r)
SNR <- 3          # signal-to-noise ratio used to scale the error variance

# Creating the target matrix with "kernel" set to rho
# Returns the p x p matrix whose (i, j) entry is r^|i-j| (ones on the diagonal).
# Built with outer() instead of nested loops so that p = 1 is handled too.
target_cor <- function(r, p){
  idx <- seq_len(p)
  r^abs(outer(idx, idx, "-"))
}

# AR Correlation Structure
Sigma.r <- target_cor(r, p)
Sigma.rho <- target_cor(rho, p)
# Error standard deviation chosen so that t(beta) Sigma.rho beta / sigma^2 = SNR
sigma.epsilon <- as.numeric(sqrt((t(beta) %*% Sigma.rho %*% beta)/SNR))
# Number of cores
# Leave one core free, but never request fewer than one worker
# (detectCores() may return 1 or NA on some machines)
nb.cores <- max(1, parallel::detectCores()-1, na.rm=TRUE)

# Registering the clusters
cl <- parallel::makeCluster(nb.cores)
doParallel::registerDoParallel(cl)
# Set the seed
set.seed(0)

# Simulate some data
# Training predictors are simulated with Sigma.r as the target matrix
x.train <- simTargetCov::simTargetCov(n=n, p=p, target=Sigma.r)
y.train <- 1 + x.train %*% beta + rnorm(n=n, mean=0, sd=sigma.epsilon)
# Test predictors are drawn from a zero-mean multivariate normal with covariance Sigma.rho
x.test <- mvnfast::rmvn(n.test, mu=rep(0,p), sigma=Sigma.rho)
y.test <- 1 + x.test %*% beta + rnorm(n.test, sd=sigma.epsilon)
# Best Split Selection for Regression
# system.time() reports how long the cross-validated fit takes
system.time(
  split.out <- cv.splitSelect(x.train, y.train, G=2, use.all=TRUE,
                              fix.partition=list(matrix(c(2,4,
                                                          3,3), ncol=2, byrow=TRUE)), fix.split=NULL,
                              intercept=TRUE, group.model="glmnet", alpha=0, nfolds=10,
                              parallel=TRUE, cores=nb.cores)
)
split.predictions <- predict(split.out, newx=x.test)
# Test-set mean squared prediction error, relative to the error variance
mean((split.predictions-y.test)^2)/sigma.epsilon^2
# Ending the cluster
parallel::stopCluster(cl)
# Each competitor below is fit on the training data, and its test-set mean
# squared prediction error is reported relative to the error variance.

# Ridge Regression
cv.ridge <- glmnet::cv.glmnet(x.train, y.train, alpha=0)
ridge <- glmnet::glmnet(x.train, y.train, alpha=0, lambda=cv.ridge$lambda.min)
ridge.predictions <- predict(ridge, newx=x.test)
mean((ridge.predictions-y.test)^2)/sigma.epsilon^2

# Lasso
cv.lasso <- glmnet::cv.glmnet(x.train, y.train, alpha=1)
lasso <- glmnet::glmnet(x.train, y.train, alpha=1, lambda=cv.lasso$lambda.min)
lasso.predictions <- predict(lasso, newx=x.test)
mean((lasso.predictions-y.test)^2)/sigma.epsilon^2

# Elastic Net
cv.elastic <- glmnet::cv.glmnet(x.train, y.train, alpha=3/4)
elastic <- glmnet::glmnet(x.train, y.train, alpha=3/4, lambda=cv.elastic$lambda.min)
elastic.predictions <- predict(elastic, newx=x.test)
mean((elastic.predictions-y.test)^2)/sigma.epsilon^2

# SplitReg
cv.splitreg <- SplitReg::cv.SplitReg(x.train, y.train, num_models=3, alpha=1e-2)
splitreg.predictions <- predict(cv.splitreg, newx=x.test)
mean((splitreg.predictions-y.test)^2)/sigma.epsilon^2
# Looking at the MSPEs for all the possible splits (out-of-sample)
# One value per row of split.out$splits: column k of split.out$betas is applied
# to the test design matrix with a leading column of ones (presumably the
# intercept comes first in each column -- confirm against the package docs),
# and the resulting MSPE is scaled by the error variance.
split.mspes <- vapply(seq_len(nrow(split.out$splits)), function(k) {
  mean((y.test-cbind(rep(1, n.test), x.test) %*% split.out$betas[,k])^2)
}, numeric(1))/sigma.epsilon^2

# Minimum MSPE for the splits (out-of-sample)
min(split.mspes)
split.mspes[split.out$optimal.split]

# Optimal splits comparison (out-of-sample)
split.out$splits[which.min(split.mspes),]
split.out$optimal.split.var

# Optimal betas comparison (out-of-sample)
split.out$betas[,which.min(split.mspes), drop=FALSE]
coef(split.out)
This package is free and open source software, licensed under GPL (>= 2).