Code for reproducing the results shown in the manuscript.
### Loading functions. ###
inst <- rownames(utils::installed.packages())
cran <- c("devtools","R.utils","Matrix","glmnet","pROC","BiocManager","ashr")
# "googledrive", "httpuv"
if(!all(cran %in% inst)){
for(i in seq_along(cran)){
if(!cran[i] %in% inst){
install.packages(cran[i])
}
}
}
bioc <- c("edgeR","TCGAbiolinks")
if(!all(bioc %in% inst)){
#source("http://bioconductor.org/biocLite.R")
for(i in seq_along(bioc)){
if(!bioc[i] %in% inst){
#biocLite(bioc[i])
BiocManager::install(bioc[i])
}
}
}
#if(!"ashr" %in% inst){
# devtools::install_github("stephens999/ashr")
#}
user <- Sys.getenv("USERNAME")
path <- file.path("C:","Users",user,"Desktop","palasso")
if(user=="arra"){path <- "C:/Users/arra/Desktop/MATHS/palasso_desktop"}
if(user==""){path <- "/virdir/Scratch/arauschenberger/palasso"}
setwd(path)
folders <- c("data","results")
invisible(sapply(folders,function(x) if(!dir.exists(x)){dir.create(x)}))
if(user!="arra"){
devtools::install_github("kkdey/CorShrink") # ref="a9f6ba0"
devtools::install_github("rauschenberger/palasso") # ref="4a995a2"
}
if(FALSE){
# The functions <<save>>, <<file.exists>> and <<file.remove>> access the hard disk, but also try to access googledrive.
save <- function(object,file){
base::save(object,file=file)
tryCatch(expr=googledrive::drive_upload(media=file,path=file),
error=function(x) NULL)
#Sys.sleep(0.5)
}
file.exists <- function(file){
offline <- base::file.exists(file)
online <- FALSE
if(!offline){
d <- googledrive::as_dribble(x=file)
online <- tryCatch(expr=googledrive::some_files(d),
error=function(x) FALSE)
#Sys.sleep(0.5)
}
return(offline|online)
}
file.remove <- function(file){
base::file.remove(file)
tryCatch(expr=googledrive::drive_trash(file=file),
error=function(x) NULL)
#Sys.sleep(0.5)
}
# The function <<update>> moves files from the research cloud to the remote drive. Given both paths, it first verifies whether the folders SIM, GDC and CCA are available, and then copies all missing files from the research cloud to the remote drive.
# from: path to the origin
# to: path to the destination
update <- function(from,to){
dir <- c("SIM","GDC","CCA")
if(any(!dir.exists(file.path(from,dir)))){stop("Invalid.")}
if(any(!dir.exists(file.path(to,dir)))){stop("Invalid.")}
pb <- utils::txtProgressBar(min=0,max=1,style=3)
for(i in seq_along(dir)){
files0 <- dir(file.path(from,dir[i]))
files1 <- dir(file.path(to,dir[i]))
names <- files0[!files0 %in% files1]
for(j in seq_along(names)){
utils::setTxtProgressBar(pb=pb,value=(i-1)/3+(j*i)/(3*length(names)))
file.copy(from=file.path(from,dir[i],names[j]),
to=file.path(to,dir[i],names[j]),
copy.date=TRUE)
}
utils::setTxtProgressBar(pb=pb,value=i/3)
}
}
# update(from="results",to="//tsclient/N/palasso/results")
}
Last data download started on 2019-03-26 (R version 3.5.3).
### Downloading "Isoform Expression Quantification". ###
#rm(list=ls())
#<<functions>>
directory <- file.path(path,"data")
setwd(directory)
# Retrieving cancer types:
project <- TCGAbiolinks::getGDCprojects()$id
project <- project[grepl(x=project,pattern="TCGA")]
# Downloading isoform expression data:
y <- X <- list()
for(i in seq_along(project)){
query <- TCGAbiolinks::GDCquery(project=project[i],
data.category="Transcriptome Profiling",
data.type="Isoform Expression Quantification")
TCGAbiolinks::GDCdownload(query,method="api",directory=directory)
trace(TCGAbiolinks:::readTranscriptomeProfiling,tracer=quote(ignore.case<-TRUE))
X[[i]] <- TCGAbiolinks::GDCprepare(query,directory=directory)
X[[i]][,c("miRNA_ID","reads_per_million_miRNA_mapped",
"cross-mapped","miRNA_region")] <- NULL
y[[i]] <- rep(project[i],times=length(unique(X[[i]]$barcode)))
}
save(list=c("y","X"),file=file.path(path,"data","isoform_raw.RData"))
load(file.path(path,"data","isoform_raw.RData"),verbose=TRUE)
# Merging isoform expression data:
Xs <- do.call(what=rbind,args=X) # sparse matrix
y <- do.call(what="c",args=y)
# Transform to matrix
Xs$isoform_coords <- gsub(pattern="hg38:chr",replacement="",x=Xs$isoform_coords)
samples <- unique(Xs$barcode)
covariates <- unique(Xs$isoform_coords)
row <- match(Xs$barcode,samples)
col <- match(Xs$isoform_coords,covariates)
X <- Matrix::sparseMatrix(i=row,j=col,x=Xs$read_count,dimnames=list(samples,covariates))
# Order by molecular location
split <- strsplit(x=colnames(X),split=":|-")
chr <- sapply(split,function(x) x[[1]])
pos <- sapply(split,function(x) x[[2]])
order <- order(chr,pos)
X <- X[,order]
if(FALSE){ # testing
i <- sample(seq_len(nrow(Xs)),size=1)
Xs$read_count[i]
X[Xs$barcode[i],Xs$isoform_coords[i]]
}
save(list=c("y","X"),file=file.path(path,"data","isoform_all.RData"))
### Downloading "miRNA Expression Quantification". ###
#rm(list=ls())
#<<functions>>
directory <- file.path(path,"data")
setwd(directory)
# Downloading data.
project <- TCGAbiolinks::getGDCprojects()$id
project <- project[grepl(x=project,pattern="TCGA")]
y <- X <- list()
for(i in seq_along(project)){
query <- TCGAbiolinks::GDCquery(project=project[i],
data.category="Transcriptome Profiling",
data.type="miRNA Expression Quantification")
TCGAbiolinks::GDCdownload(query,method="api",directory=directory)
trace(TCGAbiolinks:::readTranscriptomeProfiling,tracer=quote(ignore.case<-TRUE))
data <- TCGAbiolinks::GDCprepare(query,directory=directory)
X[[i]] <- t(data[,c(seq(from=2,to=ncol(data),by=3))])
y[[i]] <- rep(project[i],times=nrow(X[[i]]))
}
save(list=c("y","X"),file=file.path(path,"data","miRNA_raw.RData"))
load(file.path(path,"data","miRNA_raw.RData"))
X <- do.call(what=rbind,args=X)
y <- do.call(what="c",args=y)
rownames(X) <- gsub(pattern="read_count_",replacement="",x=rownames(X))
save(list=c("y","X"),file=file.path(path,"data","miRNA_all.RData"))
### Downloading "Gene Expression Quantification". ###
#rm(list=ls())
#<<functions>>
directory <- file.path(path,"data")
setwd(directory)
# Retrieving cancer types:
project <- TCGAbiolinks::getGDCprojects()$id
project <- project[grepl(x=project,pattern="TCGA")]
# Downloading data:
memory.limit(size=16000) # Activate virtual memory in system control!
y <- X <- list()
for(i in seq_along(project)){
query <- TCGAbiolinks::GDCquery(project=project[i],
data.category="Transcriptome Profiling",
data.type="Gene Expression Quantification",
workflow.type="HTSeq - Counts"); gc()
TCGAbiolinks::GDCdownload(query=query,method="api",directory=directory); gc()
trace(TCGAbiolinks:::readTranscriptomeProfiling,tracer=quote(ignore.case<-TRUE)); gc()
X[[i]] <- TCGAbiolinks::GDCprepare(query,directory=directory); gc()
y[[i]] <- rep(project[i],times=ncol(X[[i]])); gc()
}
save(list=c("y","X"),file=file.path(path,"data","gene_raw.RData"))
load(file.path(path,"data","gene_raw.RData"))
genes <- SummarizedExperiment::rowData(X[[1]])
mart <- biomaRt::useMart("ensembl",dataset="hsapiens_gene_ensembl") #
char <- biomaRt::getBM(attributes=c("ensembl_gene_id","chromosome_name","transcript_start","gene_biotype"),filters=c("biotype","chromosome_name"),values=list("protein_coding",c(1:22,"X")),mart=mart)
select <- genes$ensembl_gene_id[genes$ensembl_gene_id %in% char$ensembl_gene_id]
X <- lapply(X,function(x) t(SummarizedExperiment::assays(x)$"HTSeq - Counts"[select,]))
X <- do.call(what=rbind,args=X)
y <- do.call(what="c",args=y)
save(list=c("y","X"),file=file.path(path,"data","gene_all.RData"))
### Downloading "Copy Number Variation". ###
#rm(list=ls())
#<<functions>>
directory <- file.path(path,"data")
setwd(directory)
project <- TCGAbiolinks::getGDCprojects()$id
project <- project[grepl(x=project,pattern="TCGA")]
y <- X <- list()
for(i in seq_along(project)){
query <- TCGAbiolinks::GDCquery(project=project[i],
data.category="Copy Number Variation",
data.type="Masked Copy Number Segment")
TCGAbiolinks::GDCdownload(query=query,method="api",directory=directory)
trace(TCGAbiolinks:::readTranscriptomeProfiling,tracer=quote(ignore.case<-TRUE))
X[[i]] <- TCGAbiolinks::GDCprepare(query,directory=directory)
y[[i]] <- rep(project[i],times=length(unique(X[[i]]$Sample))) # correct?
}
save(list=c("y","X"),file=file.path(path,"data","CNV_raw.RData"))
load(file.path(path,"data","CNV_raw.RData"),verbose=TRUE)
# Merging CNV data:
Xs <- do.call(what=rbind,args=X) # sparse matrix
y <- do.call(what="c",args=y)
#table(Xs$Sample)
# Prepare cutting.
cut <- list()
cut$chr <- c(1:22,"X")
cut$start <- sapply(cut$chr,function(x) min(Xs$Start[Xs$Chromosome==x]))
cut$end <- sapply(cut$chr,function(x) max(Xs$End[Xs$Chromosome==x]))
cut$length <- cut$end-cut$start
cut$dist <- sum(cut$length)/10000
cut$num <- round(cut$length/cut$dist)
# Create covariates.
cov <- list()
cov$p <- sum(cut$num)
cov$chromosome <- unlist(sapply(cut$chr,function(i) rep(i,times=cut$num[i])))
cov$location <- unlist(sapply(cut$chr,function(i) round(seq(from=cut$start[i],to=cut$end[i],length.out=cut$num[i]))))
cov$name <- paste0(cov$chromosome,":",cov$location)
# Create indices for each covariate.
index <- rep(list(integer()),times=cov$p)
pb <- utils::txtProgressBar(min=0,max=cov$p,style=3)
for(j in seq_len(cov$p)){
utils::setTxtProgressBar(pb=pb,value=j)
index[[j]] <- which((Xs$Chromosome==cov$chromosome[j]) &
(Xs$Start<=cov$location[j]) & (cov$location[j]<=Xs$End)) # consider <
}
# Expand indices to matrix.
X <- matrix(0,nrow=length(unique(Xs$Sample)),ncol=cov$p,
dimnames=list(unique(Xs$Sample),cov$name))
for(j in seq_along(index)){
mean <- Xs$Segment_Mean[index[[j]]]
i <- Xs$Sample[index[[j]]]
X[i,j] <- mean
}
if(FALSE){ # test
sample <- sample(rownames(X),size=1)
covariate <- sample(colnames(X),size=1)
split <- strsplit(covariate,split=":")[[1]]
a <- X[sample,covariate]
b <- Xs$Segment_Mean[(Xs$Sample==sample) & (Xs$Chromosome==split[1]) & (Xs$Start<=as.numeric(split[2])) & (as.numeric(split[2]) < Xs$End)]
all(a==b)
}
save(list=c("y","X","index"),file=file.path(path,"data","CNV_all.RData"))
### Extracting samples of interest. ###
#rm(list=ls())
#<<functions>>
type <- c("isoform","miRNA","CNV","gene")
for(i in seq_along(type)){
cat(type[i],"\n")
load(file.path(path,"data",paste0(type[i],"_all.RData")),verbose=TRUE)
# TCGA barcode
barcode <- rownames(X)
code <- sapply(barcode,function(x) strsplit(x,split="-"))
code <- as.data.frame(do.call(what=rbind,args=code))
colnames(code) <- c("project","TSS","participant","sample_vial",
"portion_analyte","plate","center")
code$sample <- substr(code$sample_vial,start=1,stop=2)
code$vial <- substr(code$sample_vial,start=3,stop=3)
code$portion <- substr(code$portion_analyte,start=1,stop=2)
code$analyte <- substr(code$portion_analyte,start=3,stop=3)
code$sample_vial <- code$portion_analyte <- NULL
# solid tumour (except blood for LAML)
solid <- (code$sample=="01" | (y=="TCGA-LAML" & code$sample=="03"))
X <- X[solid,]
y <- y[solid]
# unique samples
unique <- !duplicated(substr(rownames(X),start=1,stop=12))
X <- X[unique,]
y <- y[unique]
save(list=c("y","X"),file=file.path(path,"data",paste0(type[i],"_sub.RData")))
}
# isoform: n=9'794, p=194'595, k=32
# miRNA: n=9'794, p=1'881, k=32
# gene: n=9'830, p=19'602, k=33
# CNV: n=10'578, p=10'000, k=33
## Understanding barcodes:
# overview: https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode
# details: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables
## Understanding replicate samples:
# http://gdac.broadinstitute.org/runs/sampleReports/latest/READ_Replicate_Samples.html
Last data analysis started on 2019-04-12 (R version 3.5.3).
### Analysing the TCGA data. ###
#rm(list=ls())
#<<functions>>
for(type in c("miRNA","isoform","CNV","gene")){
library(Matrix)
for(k in c(207,sample(528))){
set.seed(k); cat(k," ")
if(type %in% c("isoform","miRNA") & k > 496){next}
if(type %in% c("CNV","gene") & k > 528){next}
# searching for missing cancer-cancer combinations
rm(list=setdiff(ls(),c("k","type","path","save","file.exists","file.remove"))); gc()
file0 <- paste0("results/",type,"_start_",k,".RData")
file1 <- paste0("results/",type,"_loss_",k,".RData")
if(file.exists(file0)||file.exists(file1)){next}
save(object=k,file=file0)
load(paste0("data/",type,"_sub.RData"))
# indicating the cancer-cancer combination
cancer <- substring(text=unique(y),first=6)
comb <- utils::combn(x=cancer,m=2)
select <- paste0("TCGA-",comb[,k])
y <- ifelse(y==select[1],1,ifelse(y==select[2],0,NA))
rm(cancer,select)
# removing other cancer types
cond <- !is.na(y)
y <- y[cond]
X <- X[cond,]
rm(cond)
# pre-processing
if(type %in% c("isoform","miRNA")){
x <- palasso:::.prepare(X,cutoff="zero")
} else if(type=="gene"){
x <- palasso:::.prepare(X,cutoff="knee")
} else if(type=="CNV"){
x <- list(X=X,Z=sign(X))
x <- lapply(x,function(x) scale(x))
attributes(x)$info <- data.frame(n=nrow(X),p=ncol(X),prop=mean(x$Z==0))
}
rm(X)
# cross-validation
loss <- tryCatch(expr=palasso:::.predict(y=y,X=x,nfolds.int=10),error=function(e) palasso:::.predict(y=y,X=x,nfolds.int=10))
# information
loss$info <- cbind(k=k,
y0=comb[2,k],
y1=comb[1,k],
n0=sum(y==0),
n1=sum(y==1),
attributes(x)$info,
loss$info)
# refit
object <- palasso::palasso(y=y,X=x,nfolds=10,family="binomial",standard=TRUE,elastic=TRUE,shrink=TRUE)
model <- c(names(object),"elastic",
paste0("paired.",c("adaptive","standard","combined")))
for(max in c(10,50,Inf)){
temp <- list()
temp$nzero <- data.frame(model=model,x=NA,z=NA)
for(i in seq_along(model)){
coef <- palasso:::coef.palasso(object=object,model=model[i],max=max)
temp$nzero$x[i] <- sum(coef$x!=0)
temp$nzero$z[i] <- sum(coef$z!=0)
}
temp$select <- palasso:::subset.palasso(x=object,max=max,
model="paired.adaptive")$palasso$select
temp$weights <- palasso:::weights.palasso(object=object,max=max,
model="paired.adaptive")
temp$coef <- palasso:::coef.palasso(object=object,max=max,
model="paired.adaptive")
loss[[paste0("fit",max)]] <- temp
}
save(object=loss,file=file1)
file.remove(file0)
}
index <- sum(grepl(dir(),pattern="sessionInfo"))
sink(paste0("sessionInfo",index+1,".txt"))
date()
utils::sessionInfo()
devtools::session_info()
sink()
}
# The function <<collect>> loads all files from PATH including PATTERN in the file name, loads OBJECT into a list, and executes a function call.
#<<functions>>
# path: folder
# pattern: character, or NULL (all files)
# object: character vector, or NULL (all objects)
# what: function call
collect <- function(path=getwd(),pattern="",object=NULL,what="rbind"){
OBJECT = object
# identify files
files <- dir(path)
files <- files[grepl(x=files,pattern=pattern)]
files <- files[grepl(x=files,pattern=".RData")]
number <- gsub(pattern=paste0(pattern,"|.RData"),replacement="",x=files)
files <- files[order(as.numeric(number))] # trial1
names <- gsub(pattern=".RData",replacement="",x=files) # trial1
if(length(files)==0){stop("Invalid datasets.")}
# load data
all <- list()
for(i in seq_along(files)){
x <- load(file.path(path,files[i]))
x <- eval(parse(text=x))
if(is.null(OBJECT)){
all[[i]] <- x
names(all)[i] <- names[i]
} else {
for(j in seq_along(OBJECT)){
all[[OBJECT[j]]][[i]] <- x[[OBJECT[j]]]
names(all[[OBJECT[j]]])[i] <- names[i]
}
}
}
# fuse data
if(is.null(OBJECT)){
all <- do.call(what=what,args=all)
} else {
all <- lapply(all,function(x) do.call(what=what,args=x))
}
return(all)
}
LOSS <- list()
type <- c("gene","isoform","miRNA","CNV")
#type <- "miRNA"
for(i in seq_along(type)){
LOSS[[type[i]]] <- collect(path="results",
pattern=paste0(type[i],"_loss_"),
object=c("deviance","auc","class","info",
paste0("fit",c(10,50,Inf))))
}
for(i in seq_along(LOSS)){
for(j in 1:3){
colnames(LOSS[[i]][[j]])[colnames(LOSS[[i]][[j]])=="paired.adaptive"] <- "paired"
}
}
#type <- "gene"
#a <- LOSS[[type]]$deviance[rownames(LOSS[[type]]$deviance)=="10","paired"]
#b <- LOSS[[type]]$deviance[rownames(LOSS[[type]]$deviance)=="10","elastic"]
#mean(a<b)
### Testing for significant differences. ###
#rm(list=ls())
#<<functions>>
#<<collect>>
row <- c("gene","isoform","miRNA","CNV")
col <- c("10","Inf")
lay <- c("standard_x","standard_z","standard_xz",
"adaptive_x","adaptive_z","adaptive_xz",
"elastic") # added "elastic"
M <- array(NA,dim=c(length(row),length(col),length(lay)),dimnames=list(row,col,lay))
for(i in seq_along(row)){
loss <- LOSS[[row[i]]][c("info","deviance")]
y0 <- as.character(loss$info$y0)
y1 <- as.character(loss$info$y1)
cancer <- sort(unique(c(y0,y1)))
Z <- palasso:::.design(x=cancer)
for(j in seq_along(col)){
# differences
cond <- rownames(loss$deviance)==col[j]
for(k in seq_along(lay)){
fill <- loss$deviance[cond,lay[k]] - loss$deviance[cond,"paired"]
X <- matrix(NA,nrow=length(cancer),ncol=length(cancer),
dimnames=list(cancer,cancer))
X[cbind(y0,y1)] <- X[cbind(y1,y0)] <- fill
X[lower.tri(X)] <- NA
# p-values
pvalue <- rep(NA,times=max(Z))
for(l in seq_len(max(Z))){
x <- as.numeric(X[Z==l])
if(col[j]=="10"){
alternative <- "greater" # Never use "two.sided"!
}
if(col[j]=="Inf"){
alternative <- "less" # Never use "two.sided"!
}
pvalue[l] <- stats::wilcox.test(x=x,alternative=alternative,
exact=FALSE)$p.value
}
# Simes
M[i,j,k] <- palasso:::.combine(pvalue,method="simes")
}
}
}
# Table SIG: significance
constraint <- "10"
table <- format(M[,constraint,1:6],digits=1,scientific=FALSE)
for(i in seq_len(nrow(table))){
for(j in seq_len(ncol(table))){
if(M[i,constraint,j]>=0.05){
table[i,j] <- paste0("{\\color{gray}{",table[i,j],"}}")
}
}
}
one <- c("","\\text{standard}","","","\\text{adaptive}","")
two <- paste0("\\text{",c("x","z","xz","x","z","xz"),"}")
rownames(table) <- paste0("\\text{",rownames(table),"}")
table <- rbind(one,two,table,deparse.level=0)
rownames(table)[1] <- "~"
xtable <- xtable::xtable(table,align=c("r","|","c","c","c","|","c","c","c"))
xtable::print.xtable(xtable,type="latex",include.colnames=FALSE,sanitize.text.function=identity)
### Comparison with elastic net. ###
#rm(list=ls())
#<<functions>>
#<<collect>>
row <- c("gene","isoform","miRNA","CNV")
col <- c("10","50","Inf")
better <- worse <- less <- matrix(NA,nrow=length(row),ncol=length(col),
dimnames=list(row,col))
for(i in seq_along(row)){
for(j in seq_along(col)){
# proportion of improvements (cross-validation)
cond <- rownames(LOSS[[row[i]]]$deviance)==col[j]
loss <- LOSS[[row[i]]]$deviance[cond,c("paired","elastic")]
better[i,j] <- round(mean(loss[,"paired"]<loss[,"elastic"]),digits=2)
worse[i,j] <- round(mean(loss[,"paired"]>loss[,"elastic"]),digits=2)
# average difference in nzero (refitted models)
df_paired <- apply(LOSS[[row[i]]][[paste0("fit",col[j])]],1,function(x) sum(x$nzero[x$nzero[,"model"]=="paired.adaptive",c("x","z")]))
df_elastic <- apply(LOSS[[row[i]]][[paste0("fit",col[j])]],1,function(x) sum(x$nzero[x$nzero[,"model"]=="elastic95",c("x","z")]))
df_diff <- df_elastic-df_paired
less[i,j] <- round(mean(df_diff),digits=2)
#graphics::hist(df_diff,main=paste(row[i],col[j]),xlim=c(-1,1)*max(abs(df_diff)))
}
}
better
worse
less
### Analysing the refitted models. ###
#rm(list=ls())
#<<functions>>
#<<collect>>
# Table SEL: selected model
nzero <- paste0("fit",c(5,10,Inf))
model <- c(paste0("standard_",c("x","z","xz")),
paste0("adaptive_",c("x","z","xz")),
"between_xz","within_xz")
type <- c("gene","isoform","miRNA","CNV")
table <- array(NA,dim=c(length(nzero),length(model),length(type)),
dimnames=list(nzero,model,type))
for(i in seq_along(nzero)){
for(j in seq_along(model)){
for(k in seq_along(type)){
sub <- LOSS[[type[k]]][[nzero[i]]]
table[i,j,k] <- sum(sub[,"select"]==model[j])
}
}
}
colSums(table["fit10",,]) # CHECK WHETHER COMPLETE!
table <- round(prop.table(table["fit10",,],margin=2),digits=2)
table <- t(table)
table <- table[,apply(table,2,function(x) any(x!=0))]
rownames(table) <- paste0("\\text{",rownames(table),"}")
xtable <- xtable::xtable(table,align=c("r","|","c","c","c","c"))
xtable::print.xtable(xtable,type="latex",include.colnames=FALSE,sanitize.text.function=identity)
# selected weights and covariates
type <- c("gene","isoform","miRNA","CNV")
group <- c("x","z")
model <- c(paste0("standard_",c("x","z","xz")),
paste0("adaptive_",c("x","z","xz")),
"paired.adaptive","elastic") # added "elastic" , paste0("elastic",c(100,75,50,25))
weights10 <- weightsInf <- matrix(NA,nrow=length(group),ncol=length(type),
dimnames=list(group,type))
coef10 <- coefInf <- array(NA,dim=c(length(group),length(type),length(model)),
dimnames=list(group,type,model))
for(i in seq_along(group)){
for(j in seq_along(type)){
weights10[,j] <- rowMeans(sapply(LOSS[[type[j]]]$fit10[,"weights"],colMeans))
weightsInf[,j] <- rowMeans(sapply(LOSS[[type[j]]]$fitInf[,"weights"],colMeans))
for(k in seq_along(model)){
coef10[i,j,k] <- mean(sapply(LOSS[[type[j]]]$fit10[,"nzero"], function(x) sum(x[x$model==model[k],group[i]])))
coefInf[i,j,k] <- mean(sapply(LOSS[[type[j]]]$fitInf[,"nzero"], function(x) sum(x[x$model==model[k],group[i]])))
}
}
}
# coef10["x",,]+coef10["z",,]
# with sparsity constraint
round(prop.table(weights10,margin=2),2)
round(prop.table(coef10[,,"paired.adaptive"],margin=2),2)
round(colSums(coef10[,,"paired.adaptive"]),2)
# natural sparsity
round(prop.table(weightsInf,margin=2),2)
round(prop.table(coefInf[,,"paired.adaptive"],margin=2),2)
round(colSums(coefInf[,,"paired.adaptive"]),2)
round(colSums(coefInf[,,"elastic"])/colSums(coefInf[,,"paired.adaptive"]),1) # multiple nzero of elastic net and paired lasso
# Table NSC: number of non-zero coefficients
table <- round(coefInf["x",,]+coefInf["z",,])
colnames(table)[7] <- "paired"
one <- c("","\\text{standard}","","","\\text{adaptive}","","\\text{paired}","\\text{elastic}")
two <- paste0("\\text{",c("x","z","xz","x","z","xz","xz","xz"),"}")
rownames(table) <- paste0("\\text{",rownames(table),"}")
table <- rbind(one,two,table,deparse.level=0)
rownames(table)[1] <- "~"
xtable <- xtable::xtable(table,align=c("r","|","c","c","c","|","c","c","c","|","c","|","c"))
xtable::print.xtable(xtable,type="latex",include.colnames=FALSE,sanitize.text.function=identity)
### FIGURE CSW ###
#rm(list=ls())
#<<functions>>
set.seed(1)
overfit <- TRUE
# simulate
n <- 10
cx <- stats::rbeta(n=n,shape1=0.9,shape2=1)
cz <- stats::rbeta(n=n,shape1=0.4,shape2=0.9)
# collection
x <- list()
y <- list()
# adaptive weights (X only)
x[[1]] <- rep(1,times=n)
if(overfit){x[[1]] <- cx}
y[[1]] <- rep(0,times=n)
# adaptive weights (Z only)
x[[2]] <- rep(0,times=n)
y[[2]] <- rep(1,times=n)
if(overfit){y[[2]] <- cz}
# adaptive weights (X and Z)
x[[3]] <- y[[3]] <- rep(0.5,times=n)
if(overfit){x[[3]] <- cx}
if(overfit){y[[3]] <- cz}
# within-pair weights
x[[4]] <- cx^2/(cx+cz)
y[[4]] <- cz^2/(cx+cz)
# visualisation
graphics::par(mfrow=c(1,4),mar=c(4.5,0.5,0.5,0.5),oma=c(0,2,0,0))
for(i in seq_len(4)){
palasso:::plot_pairs(x=x[[i]],y=y[[i]],lwd=4)
if(i==1){
graphics::mtext(text="covariate pair",side=2,line=1)
}
}
### FIGURE DIA ###
#rm(list=ls())
#<<functions>>
ellipse <- function(x,y,a=0.2,b=0.25,border=NA){
n <- max(c(length(x),length(y)))
if(length(x)==1){x <- rep(x,times=n)}
if(length(y)==1){y <- rep(y,times=n)}
if(length(border)==1){border <- rep(border,times=n)}
for(i in seq_len(n)){
angle <- seq(from=0,to=2*pi,length=100)
xs <- x[i] + a * cos(angle)
ys <- y[i] + b * sin(angle)
graphics::polygon(x=xs,y=ys,col=grey(0.9),border=border[i])
}
}
cancer <- c("ACC","BLCA","BRCA","UVM")
number <- c(80,409,1078,80)
col <- grDevices::rgb(red=0,green=0,blue=sample(seq(from=75,to=255,length.out=length(number))),maxColorValue=255)
lwd <- log(2*number)-2
lwd = pmax(5*number/max(number),1)
x1 <- 1.3; x2 <- 2; x3 <- 3
set.seed(1)
# first layer (bubble)
graphics::plot.new()
graphics::par(mar=c(0,0,0,0))
graphics::plot.window(xlim=c(1.1,3.3),ylim=c(-1.6,1.6))
ellipse(x=x1,y=0)
graphics::text(x=x1,y=0,labels="TCGA",font=2,adj=c(0.5,0))
graphics::text(x=x1,y=0,labels="n=9794",adj=c(0.5,1.2),cex=0.9)
# second layer (colon)
y1 <- seq(from=1,to=-1,length.out=length(cancer)+1)
graphics::text(x=x2,y=y1[length(cancer)],labels="...",font=2,srt=90)
y1 <- y1[-length(cancer)]
# first-second layer (connect)
graphics::segments(x0=x1+0.2,y0=0,x1=x2-0.2,y1=y1,lwd=lwd,col=col)
# second layer (bubble)
ellipse(x=x2,y=y1,a=0.2,b=0.2)
graphics::text(x=x2,y=y1,labels=cancer,font=2,col=col,adj=c(0.5,0))
graphics::text(x=x2,y=y1,labels=paste0("n=",number,""),adj=c(0.5,1.2),cex=0.9)
comb <- utils::combn(x=seq_along(y1),m=2)
# third layer (colon)
y2 <- seq(from=1.5,to=-1.5,length.out=ncol(comb)+1)
graphics::text(x=x3,y=y2[ncol(comb)],labels="...",font=2,srt=90)
y2 <- y2[-ncol(comb)]
# second-third layer (connect)
graphics::segments(x0=2.2,y0=y1[comb[1,]],x1=2.7,y1=y2,lwd=lwd[comb[1,]],col=col[comb[1,]])
graphics::segments(x0=2.2,y0=y1[comb[2,]],x1=2.7,y1=y2,lwd=lwd[comb[2,]],col=col[comb[2,]])
# third layer (bubble)
ellipse(x=x3,y=y2,a=0.3,b=0.22)
graphics::text(x=x3,y=y2,labels=paste0(cancer[comb[1,]]," "),
font=2,col=col[comb[1,]],adj=c(1,0))
graphics::text(x=x3,y=y2,labels=paste0(" ",cancer[comb[2,]]),
font=2,col=col[comb[2,]],adj=c(0,0))
graphics::text(x=x3,y=y2,labels=":",font=2,adj=c(0.5,0))
labels <- apply(comb,2,function(x) sum(number[x]))
labels <- paste0("n=",labels,"")
graphics::text(x=x3,y=y2,labels=labels,adj=c(0.5,1.2),cex=0.9)
### FIGURE CLA ###
#rm(list=ls())
#<<functions>>
graphics::par(mfrow=c(4,2),oma=c(0,0,0,0),mar=c(2.1,3.5,0.5,0.5))
for(type in c("gene","isoform","miRNA","CNV")){
loss <- LOSS[[type]][c("deviance","auc","class")]
choice <- "paired"
loss <- lapply(loss,function(x) x[,c(paste0("standard_",c("x","z","xz")),paste0("adaptive_",c("x","z","xz")),choice)])
for(constraint in c("10")){ # c("5","10","Inf")
# change
sub <- lapply(loss,function(x) x[rownames(x)==constraint,])
palasso:::plot_score(sub$deviance,choice=choice)
change <- sub$deviance[,7]-sub$deviance[,-7]
palasso:::plot_box(change,ylab="change",zero=TRUE,choice=NA)
# info
info <- list()
info$select <- names(which.min(apply(sub$deviance,2,median)[-7]))
info$DEV_paired <- median(sub$deviance[,choice])
info$DEV_select <- median(sub$deviance[,info$select])
info$improve <- mean(sub$deviance[,info$select]>sub$deviance[,choice])
info$AUC_paired <- median(sub$auc[,choice])
info$CLASS_paired <- median(sub$class[,choice])
print(as.data.frame(info)) # important
}
}
### FIGURE DEC ###
#rm(list=ls())
#<<functions>>
#graphics::par(oma=c(1.0,1.0,0,0),mar=c(1.5,3.0,0.5,0.5),mfrow=c(1,1))
graphics::par(oma=c(1.0,1.0,0,0),mar=c(1.5,3.0,0.5,0.5),mfrow=c(2,2))
for(type in c("gene","isoform","miRNA","CNV")){
models <- c(paste0("standard_",c("x","z","xz")),
paste0("adaptive_",c("x","z","xz")),"paired")
constraint <- c("3","4","5","10","15","20","25","50","Inf")
loss <- LOSS[[type]]["deviance"]
loss <- lapply(loss,function(x) x[,models])
table <- matrix(NA,nrow=length(constraint),ncol=length(models),
dimnames=list(constraint,models))
for(i in seq_along(constraint)){
sub <- lapply(loss,function(x) x[rownames(x)==constraint[i],])
table[i,] <- apply(sub$deviance,2,median)
}
# table <- log(table)
graphics::plot.new()
graphics::plot.window(xlim=c(1,length(constraint)),ylim=range(table))
graphics::box()
constraint[constraint=="Inf"] <- "n"
graphics::axis(side=2)
graphics::axis(side=1,at=seq_along(constraint),labels=constraint,tick=FALSE,line=-1)
for(k in c(1,2)){
for(i in seq_along(models)){
lty <- ifelse(i%in%c(1,2,3),3,ifelse(i%in%c(4,5,6),2,1))
col <- ifelse(i==7,"#00007F","#FF3535")
pch <- ifelse(i%in%c(1,4),"x",ifelse(i%in%c(2,5),"z",1))
if(k==1){
graphics::lines(table[,i],col=col,lty=lty,lwd=2)
graphics::points(table[,i],col="white",pch=16,cex=1.2)
} else {
graphics::points(table[,i],col=col,pch=1,font=2)
}
}
}
}
graphics::title(ylab="deviance",line=0.0,outer=TRUE)
graphics::title(xlab="sparsity constraint",ylab="deviance",line=0.0,outer=TRUE)
### FIGURE CNV ###
#rm(list=ls())
#<<functions>>
graphics::par(oma=c(0,0,0,0),mar=c(2.1,3.5,0.5,0.5))
graphics::layout(matrix(c(1,1,2,2),nrow=1))
loss <- LOSS[[type]][c("deviance","auc","class")]
loss <- lapply(loss,function(x) x[rownames(x)=="10",])
model <- c(paste0("standard_",c("x","z","xz")),
paste0("adaptive_",c("x","z","xz")))
diff <- loss$auc[,"paired"]-loss$auc[,model]
palasso:::plot_box(diff,zero=TRUE,invert=TRUE,ylab="change")
diff <- loss$class[,"paired"]-loss$class[,model]
palasso:::plot_box(diff,zero=TRUE,ylab="change")
### FIGURE MAP ###
#rm(list=ls())
#<<functions>>
loss <- LOSS[["CNV"]][c("info","auc")]
cancer <- sort(unique(c(levels(loss$info$y0),levels(loss$info$y1))))
X <- matrix(NA,nrow=length(cancer),ncol=length(cancer),dimnames=list(cancer,cancer))
#Z <- palasso:::.design(x=cancer)
y0 <- as.character(loss$info$y0)
y1 <- as.character(loss$info$y1)
X[cbind(y0,y1)] <- X[cbind(y1,y0)] <- loss$auc[rownames(loss$auc)=="10","paired"]
graphics::par(mar=c(0.5,3.0,3.0,0.5))
dimnames(X) <- lapply(dimnames(X),function(x) paste0(" ",x," "))
palasso:::plot_table(X=X,margin=-1,labels=FALSE,las=2,cex=0.7)
#sort(rowMeans(X,na.rm=TRUE),decreasing=TRUE)[1:2] # keep!
### FIGURE COM ###
# 32 cancer types for isoform and miRNA
# 33 cancer types for gene and CNV
#rm(list=ls())
#<<functions>>
for(type in c("miRNA")){
cancer <- sort(unique(as.character(unlist(LOSS[[type]]$info[,c("y0","y1")]))))
n <- length(cancer)
z <- as.numeric(palasso:::.design(x=n))
x <- rep(seq_len(n),each=n)
y <- rep(seq(from=n,to=1,by=-1),times=n)
pch <- z
pch[pch==0] <- NA
pex <- c(".","O","*","+","o","-","'","x")
# colour
base <- grDevices::colorRampPalette(colors=c('darkblue','blue','red','darkred'))(n)
col <- rep(NA,times=length(z))
col[z==0] <- "white"
for(i in seq_len(n)){
col[z==i] <- base[i]
}
graphics::par(mfrow=c(1,1),mar=c(0,0,2,2))
graphics::plot.new()
graphics::plot.window(xlim=c(1,n),ylim=c(1,n))
graphics::points(x=x[pch<=25],y=y[pch<=25],
pch=pch[pch<=25],col=col[pch<=25],cex=0.9)
graphics::points(x=x[pch>25],y=y[pch>25],
pch=pex[(pch-25)[pch>25]],col=col[pch>25],cex=0.9)
graphics::segments(x0=0,x1=n+1,y0=n+1)
graphics::segments(x0=n+1,y0=n+1,y1=0)
graphics::segments(x0=0,x1=n+1,y0=n+1,y1=0,lty=2)
graphics::mtext(text=cancer,side=3,at=1:n,las=2,cex=0.7)
graphics::mtext(text=cancer,side=4,at=n:1,las=2,cex=0.7)
}