This script requires that the working directory includes the folders “data”, “results”, and “manuscript”, with the files “PPMI_Baseline_Data_02Jul2018.csv”, “PPMI_Year_1-3_Data_02Jul2018.csv” and “PPMI_Data_Dictionary_for_Baseline_Dataset_Jul2018.csv” in the folder “data”. We obtained our results using R 4.3.0 (2023-04-21) with cornet 0.0.8 (2023-06-01) on a local machine (aarch64-apple-darwin20, macOS Ventura 13.4).
# features
X <- read.csv("data/PPMI_Baseline_Data_02Jul2018.csv",row.names="PATNO",na.strings=c(".",""))
X <- X[X$APPRDX==1,] # Parkinson's disease
X[c("SITE","APPRDX","EVENT_ID","symptom5_comment")] <- NULL
100*mean(is.na(X)) # proportion missingness
#x <- mice::complete(data=mice::mice(X,m=10,maxit=5,method="pmm",seed=1),action="all") # low-dimensional
x <- lapply(seq_len(10),function(x) missRanger::missRanger(data=X,pmm.k=3,
num.trees=100,verbose=0,seed=1)) # high-dimensional
x <- lapply(x,function(x) model.matrix(~.-1,data=x))
# outcome
Y <- read.csv("data/PPMI_Year_1-3_Data_02Jul2018.csv",na.strings=".")
Y <- Y[Y$APPRDX==1 & Y$EVENT_ID %in% c("V04","V06","V08"),]
Y <- Y[,c("EVENT_ID","PATNO","moca")]
Y <- reshape(Y,idvar="PATNO",timevar="EVENT_ID",direction="wide")
rownames(Y) <- Y$PATNO; Y$PATNO <- NULL
# overlap
names <- intersect(rownames(X),rownames(Y))
Y <- Y[names,]; x <- lapply(x,function(x) x[names,])
save(Y,x,file="data/processed_data.RData")
writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),
sessioninfo::session_info()),con="data/info_data.txt")
# dictionary
rm(list=ls())
info <- read.csv("data/PPMI_Data_Dictionary_for_Baseline_Dataset_Jul2018.csv",nrows=238)
info <- info[!info$Variable %in% c("","PATNO","SITE","APPRDX","EVENT_ID","symptom5_comment"),]
info <- paste(paste0("\\item \\textit{",info$Variable,"}: ",info$Description),collapse=" ")
info <- gsub(pattern=" \\(OFF\\)",replacement="",x=info)
info <- gsub(pattern="_",replacement="\\_",x=info,fixed=TRUE)
info <- gsub(pattern="&",replacement="\\\\&",x=info)
cat(info)
load("data/processed_data.RData",verbose=TRUE)
colSums(!is.na(Y)) # sample size
round(100*colMeans(Y<25.5,na.rm=TRUE),1) # proportion impairment
# --- Cross-validating models. ---
names <- paste0(rep(c("lasso","ridge"),each=3),seq_len(3)) # change to 3!
loss <- fit <- p.log <- p.lin <- list()
for(i in seq_along(x)){
loss[[i]] <- fit[[i]] <- p.log[[i]] <- p.lin[[i]] <- list()
cat("i =",i,"\n")
for(j in seq_along(names)){
cat("j =",j," ")
alpha <- 1*(substr(names[j],start=1,stop=5)=="lasso")
index <- as.numeric(substr(names[j],start=6,stop=6))
y <- Y[,index]
cond <- !is.na(y)
set.seed(i)
loss[[i]][[names[j]]] <- cornet::cv.cornet(y=y[cond],cutoff=25.5,
X=x[[i]][cond,],alpha=alpha,rf=(alpha==1),xgboost=(alpha==1))
set.seed(i)
fit[[i]][[names[j]]] <- cornet::cornet(y=y[cond],cutoff=25.5,
X=x[[i]][cond,],alpha=alpha)
set.seed(i)
temp <- replicate(n=50,expr=cornet:::.test(y=y[cond],cutoff=25.5,X=x[[i]][cond,],alpha=alpha))
p.log[[i]][[names[j]]] <- median(unlist(temp["log",]))
p.lin[[i]][[names[j]]] <- median(unlist(temp["lin",]))
}
cat("\n")
}
save(loss,fit,p.log,p.lin,file="results/application.RData")
writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),
sessioninfo::session_info()),con="results/info_app.txt")
load("results/application.RData",verbose=TRUE)
k <- "binomial" # compare: k <- "gaussian
names <- c(paste0("lasso",1:3),paste0("ridge",1:3))
frame <- data.frame(row.names=names)
for(i in names){
# deviance
dev <- as.data.frame(t(sapply(loss,function(x) x[[i]]$deviance)))
dec <- (dev$combined-dev[[k]])/dev[[k]]
# change in percent
frame[i,"dev.min"] <- round(100*min(dec),digits=1)
frame[i,"dev.max"] <- round(100*max(dec),digits=1)
# proportion with improvement
frame[i,"dev.num"] <- sum(dev$combined < dev[[k]])
# significance based on multi-split
if(k=="binomial"){
pvalue <- sapply(p.log,function(x) x[[i]])
}
if(k=="gaussian"){
pvalue <- sapply(p.lin,function(x) x[[i]])
}
frame[i,"pval.min"] <- round(min(pvalue),digits=3)
frame[i,"pval.max"] <- round(max(pvalue),digits=3)
# quantiles weight parameter
q <- round(quantile(sapply(fit,function(x) x[[i]]$pi.min),probs=c(0,0.5,1)),digits=2)
frame[i,"pi.min"] <- q[1]
frame[i,"pi.med"] <- q[2]
frame[i,"pi.max"] <- q[3]
# quantiles scale parameter
q <- round(quantile(sapply(fit,function(x) x[[i]]$sigma.min),probs=c(0,0.5,1)),digits=2)
frame[i,"sd.min"] <- q[1]
frame[i,"sd.med"] <- q[2]
frame[i,"sd.max"] <- q[3]
}
# presentation
frame <- format(frame)
rownames(frame) <- paste0(substr(x=rownames(frame),start=1,stop=5)," ",
substr(x=rownames(frame),start=6,stop=6))
colnames(frame) <- gsub(pattern="dev.",replacement="\\\\delta_{\\\\text{",x=colnames(frame))
colnames(frame) <- gsub(pattern="pval.",replacement="p_{\\\\text{",x=colnames(frame))
colnames(frame) <- gsub(pattern="pi.",replacement="\\\\pi_{\\\\text{",x=colnames(frame))
colnames(frame) <- gsub(pattern="sd.",replacement="\\\\sigma_{\\\\text{",x=colnames(frame))
colnames(frame) <- paste0("$",colnames(frame),"}}$")
xtable <- xtable::xtable(frame,align="r|rrc|cc|ccc|ccc")
xtable::print.xtable(xtable,include.rownames=TRUE,sanitize.text.function=function(x) x,comment=FALSE)
load("results/application.RData",verbose=TRUE)
names <- c(paste0("lasso",1:3),paste0("ridge",1:3))
type <- c("deviance","class","mse","mae","auc","prauc")
k <- "binomial"
frame <- matrix(NA,nrow=length(names),ncol=length(type),dimnames=list(names,type))
for(i in names){
for(j in type){
value <- as.data.frame(t(sapply(loss,function(x) x[[i]][[j]])))
change <- 100*(value$combined-value[[k]])/value[[k]]
frame[i,j] <- median(change)
}
}
frame <- format(round(frame,digits=1))
frame <- gsub(pattern=" ",replacement="+",x=frame)
colnames(frame) <- sapply(colnames(frame),function(x) switch(x,class="\\textsc{mcr}",mse="\\textsc{mse}",mae="\\textsc{mae}",auc="\\textsc{roc-auc}",prauc="\\textsc{pr-auc}",x))
colnames(frame) <- paste0("$\\Delta$",colnames(frame))
rownames(frame) <- paste0(substr(x=rownames(frame),start=1,stop=5)," ",
substr(x=rownames(frame),start=6,stop=6))
xtable <- xtable::xtable(frame,align="r|cccccc")
xtable::print.xtable(xtable,include.rownames=TRUE,sanitize.text.function=function(x) x,comment=FALSE)
load("results/application.RData",verbose=TRUE)
table <- numeric()
for(i in 1:3){
lasso <- apply(sapply(loss,function(x) x[[paste0("lasso",i)]]$deviance),1,median)
names(lasso)[names(lasso)=="binomial"] <- "logistic lasso regression"
names(lasso)[names(lasso)=="combined"] <- "combined lasso regression"
ridge <- apply(sapply(loss,function(x) x[[paste0("ridge",i)]]$deviance),1,median)
names(ridge)[names(ridge)=="binomial"] <- "logistic ridge regression"
names(ridge)[names(ridge)=="combined"] <- "combined ridge regression"
table <- cbind(table,c(lasso[c("logistic lasso regression","combined lasso regression")],ridge[c("logistic ridge regression","combined ridge regression")],lasso["rf"],lasso["xgboost"]))
rownames(table)[rownames(table)=="rf"] <- "\\texttt{randomForest}"
rownames(table)[rownames(table)=="xgboost"] <- "\\texttt{xgboost}"
}
colnames(table) <- paste("year",1:3)
rownames(table) <- paste("\\footnotesize",rownames(table))
xtable <- xtable::xtable(table)
xtable::print.xtable(xtable,comment=FALSE,hline.after=c(0,2,4),sanitize.text.function=identity)
load("results/application.RData",verbose=TRUE)
sum <- fit[[1]]$lasso1
sum$cvm <- Reduce("+",lapply(fit,function(x) x$lasso1$cvm))
sum$sigma.min <- sapply(fit,function(x) x$lasso1$sigma.min)
sum$pi.min <- sapply(fit,function(x) x$lasso1$pi.min)
grDevices::pdf("manuscript/figure_MAP.pdf",width=4,height=3)
graphics::par(mar=c(4,4,0.5,0.5))
cornet:::plot.cornet(sum)
grDevices::dev.off()
rm(list=ls())
sigma <- c(1,2,3); cutoff <- 25.5
x <- seq(from=20,to=30,length.out=100)
grDevices::pdf("manuscript/figure_TFN.pdf",width=4,height=3)
graphics::par(mar=c(4,4,0.5,0.5))
graphics::plot.new()
graphics::plot.window(xlim=range(x),ylim=c(0,1))
graphics::box()
graphics::axis(side=1)
graphics::axis(side=2)
graphics::title(xlab=expression(hat(y)),ylab=expression(Phi(hat(y),mu,sigma^2)),line=2.5)
graphics::abline(h=0.5,lty=2,col="grey")
graphics::abline(v=cutoff,lty=2,col="grey")
lty <- c(2,1,3); lwd <- c(1,1,2)
lty <- c("dashed","solid","dotted")
for(i in seq_along(sigma)){
p <- stats::pnorm(q=x,mean=cutoff,sd=sigma[i])
graphics::lines(x=x,y=p,lty=lty[i],lwd=lwd[i])
}
graphics::text(x=cutoff,y=0.40,labels=bquote(mu==.(cutoff)),pos=4)
legend <- sapply(sigma,function(x) as.expression(bquote(sigma == .(x))))
graphics::legend(x="topleft",legend=legend,lty=lty,bty="n",lwd=lwd)
grDevices::dev.off()