If you encounter dependency error for sva
package, install it from Bioconductor:
omicwas
is a package for cell-type-specific disease association testing, using bulk tissue data as input. The package accepts DNA methylation data for epigenome-wide association studies (EWAS), as well as gene expression data for differential gene expression analyses. The main function is ctassoc
See description.
Let’s load a sample data.
data(GSE42861small)
X = GSE42861small$X
W = GSE42861small$W
Y = GSE42861small$Y
Y = Y[seq(1, 20), ] # for brevity
C = GSE42861small$C
See description.
The conventional way is to use ordinary linear regression.
result = ctassoc(X, W, Y, C = C,
test = "full")
#> Linear regression ...
#> Summarizing result ...
result$coefficients
#> # A tibble: 380 x 6
#> response celltype term estimate statistic p.value
#> <chr> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 cg10543797 CD4. disease 0.0116 0.387 6.99e- 1
#> 2 cg10543797 CD4. 1 0.818 54.1 8.77e-241
#> 3 cg10543797 CD8. disease 0.0683 2.21 2.71e- 2
#> 4 cg10543797 CD8. 1 0.798 51.0 1.45e-227
#> 5 cg10543797 mono disease -0.0611 -0.977 3.29e- 1
#> 6 cg10543797 mono 1 0.745 23.5 2.27e- 88
#> 7 cg10543797 Bcells disease 0.0469 0.761 4.47e- 1
#> 8 cg10543797 Bcells 1 0.868 28.2 4.48e-114
#> 9 cg10543797 NK disease -0.0272 -0.764 4.45e- 1
#> 10 cg10543797 NK 1 0.817 43.6 7.22e-194
#> # … with 370 more rows
We recommend nonlinear regression with ridge regularization. For DNA methylation, we use the logit function for normalization, and the test option is nls.logit
result = ctassoc(X, W, Y, C = C,
test = "nls.logit",
regularize = TRUE)
#> nls.logit ...
#>
|
| | 0%
|
|======================================================================| 100%
#> Summarizing result ...
print(result$coefficients, n = 20)
#> # A tibble: 380 x 6
#> response celltype term estimate statistic p.value
#> <chr> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 cg10543797 CD4. 1 1.50 15.5 1.24e- 46
#> 2 cg10543797 CD8. 1 1.33 14.5 9.53e- 42
#> 3 cg10543797 mono 1 1.19 6.93 1.03e- 11
#> 4 cg10543797 Bcells 1 1.81 7.29 9.12e- 13
#> 5 cg10543797 NK 1 1.63 13.5 8.63e- 37
#> 6 cg10543797 Nue 1 1.35 56.3 1.32e-253
#> 7 cg10543797 Eos 1 0.996 5.84 8.30e- 9
#> 8 cg10543797 CD4. disease 0.000711 0.465 6.42e- 1
#> 9 cg10543797 CD8. disease 0.00202 1.43 1.52e- 1
#> 10 cg10543797 mono disease -0.000917 -0.921 3.57e- 1
#> 11 cg10543797 Bcells disease 0.000350 0.505 6.14e- 1
#> 12 cg10543797 NK disease -0.000504 -0.468 6.40e- 1
#> 13 cg10543797 Nue disease -0.00555 -0.855 3.93e- 1
#> 14 cg10543797 Eos disease 0.000278 0.474 6.35e- 1
#> 15 cg10543797 <NA> sexF -0.0415 -3.94 9.16e- 5
#> 16 cg10543797 <NA> age -0.000303 -0.746 4.56e- 1
#> 17 cg10543797 <NA> smoking_current -0.00532 -0.430 6.67e- 1
#> 18 cg10543797 <NA> smoking_ex -0.0192 -1.58 1.13e- 1
#> 19 cg10543797 <NA> smoking_occasional 0.00109 0.0627 9.50e- 1
#> 20 cg22718169 CD4. 1 2.18 15.5 2.75e- 46
#> # … with 360 more rows
The first 19 lines show the result for CpG site cg10543797. Line 1 shows that the basal methylation level in CD4. (actually CD4+ T cells) is plogis(1.498) = 0.817, so this cell type is 81% methylated. Line 8 shows that the CD4.-specific effect of the disease is 7.10e-04 (in logit scale). Since the p.value is 0.64, this is not significant. Line 15 shows that the effect of sexF (female compared to male) is -4.14e-02 with a small p.value 9.15e-05. Since sexF is a covariate that has uniform effect across cell types, the celltype column is NA.
Let’s load a sample data.
data(GTExsmall)
X = GTExsmall$X
W = GTExsmall$W
Y = GTExsmall$Y + 1
Y = Y[seq(1, 20), ] # for brevity
C = GTExsmall$C
See description.
The conventional way is to use ordinary linear regression.
result = ctassoc(X, W, Y, C = C,
test = "full")
#> Linear regression ...
#> Summarizing result ...
result$coefficients
#> # A tibble: 260 x 6
#> response celltype term estimate statistic p.value
#> <chr> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 ENSG00000059804 Granulocytes age 45611. 2.56 1.07e- 2
#> 2 ENSG00000059804 Granulocytes 1 378100. 1.64 1.02e- 1
#> 3 ENSG00000059804 B.cells..CD19.. age 143773. 0.683 4.95e- 1
#> 4 ENSG00000059804 B.cells..CD19.. 1 -774328. -0.324 7.46e- 1
#> 5 ENSG00000059804 CD4..T.cells age -104302. -2.42 1.59e- 2
#> 6 ENSG00000059804 CD4..T.cells 1 -1043816. -2.28 2.30e- 2
#> 7 ENSG00000059804 CD8..T.cells age 94351. 1.42 1.57e- 1
#> 8 ENSG00000059804 CD8..T.cells 1 2577296. 3.49 5.47e- 4
#> 9 ENSG00000059804 NK.cells..CD3..CD56.. age 134268. 2.34 1.96e- 2
#> 10 ENSG00000059804 NK.cells..CD3..CD56.. 1 5027033. 6.61 1.33e-10
#> # … with 250 more rows
We recommend nonlinear regression with ridge regularization. For DNA methylation, we use the log function for normalization, and the test option is nls.log
result = ctassoc(X, W, Y, C = C,
test = "nls.log",
regularize = TRUE)
#> nls.log ...
#>
|
| | 0%
|
|======================================================================| 100%
#> Summarizing result ...
print(result$coefficients, n = 15)
#> # A tibble: 260 x 6
#> response celltype term estimate statistic p.value
#> <chr> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 ENSG00000059804 Granulocytes 1 8.85 0.544 5.87e- 1
#> 2 ENSG00000059804 B.cells..CD19.. 1 8.85 0.0490 9.61e- 1
#> 3 ENSG00000059804 CD4..T.cells 1 8.85 0.252 8.01e- 1
#> 4 ENSG00000059804 CD8..T.cells 1 10.7 1.21 2.25e- 1
#> 5 ENSG00000059804 NK.cells..CD3..CD56.. 1 14.2 52.7 2.61e-179
#> 6 ENSG00000059804 Monocytes..CD14.. 1 8.85 0.0666 9.47e- 1
#> 7 ENSG00000059804 Granulocytes age 0.000462 0.221 8.25e- 1
#> 8 ENSG00000059804 B.cells..CD19.. age 0.0000111 0.107 9.15e- 1
#> 9 ENSG00000059804 CD4..T.cells age -0.0000308 -0.0401 9.68e- 1
#> 10 ENSG00000059804 CD8..T.cells age 0.000655 0.312 7.55e- 1
#> 11 ENSG00000059804 NK.cells..CD3..CD56.. age 0.0185 4.62 5.20e- 6
#> 12 ENSG00000059804 Monocytes..CD14.. age 0.0000263 0.0955 9.24e- 1
#> 13 ENSG00000059804 <NA> sexF -0.00257 -0.0273 9.78e- 1
#> 14 ENSG00000147454 Granulocytes 1 10.1 5.20 3.20e- 7
#> 15 ENSG00000147454 B.cells..CD19.. 1 9.25 0.197 8.44e- 1
#> # … with 245 more rows
The first 13 lines show the result for transcript ENSG00000059804. Line 1 shows that the basal expression level in Granulocytes is exp(8.847) = 6955. Line 7 shows that the Granulocytes-specific effect of age is 4.62e-04 (in log scale). Since the p.value is 0.82, this is not significant. Line 13 shows that the effect of sexF (female compared to male) is -2.57e-03 with p.value 0.97 Since sexF is a covariate that has uniform effect across cell types, the celltype column is NA.
For QTL analyses, we use ctcisQTL
function instead of ctassoc
. To speed up computation, we perform linear ridge regression, thus the statistical test is almost identical to ctassoc(test = "nls.identity", regularize = TRUE)
. We analyze only in the linear scale. Association analysis is performed between each row of Y and each row of X. See description.
Let’s load a sample data.
data(GSE79262small)
X = GSE79262small$X
Xpos = GSE79262small$Xpos
W = GSE79262small$W
Y = GSE79262small$Y
Ypos = GSE79262small$Ypos
C = GSE79262small$C
X = X[seq(1, 3001, 100), ] # for brevity
Xpos = Xpos[seq(1, 3001, 100)]
Y = Y[seq(1, 501, 100), ]
Ypos = Ypos[seq(1, 501, 100)]
See description.
Analyze mQTL.
ctcisQTL(X, Xpos, W, Y, Ypos, C = C)
#> Writing output to /var/folders/7q/n5cckft14d9345z_tdpy3qqc0000gn/T//RtmpFBI9hb/ctcisQTL.out.txt
#>
|
| | 0%
|
|== | 3%
|
|===== | 6%
|
|======= | 10%
|
|========= | 13%
|
|=========== | 16%
|
|============== | 19%
|
|================ | 23%
|
|================== | 26%
|
|==================== | 29%
|
|======================= | 32%
|
|================================ | 45%
|
|================================== | 48%
|
|==================================== | 52%
|
|====================================== | 55%
|
|========================================= | 58%
|
|=========================================================== | 84%
|
|============================================================= | 87%
|
|=============================================================== | 90%
|
|================================================================= | 94%
|
|==================================================================== | 97%
|
|======================================================================| 100%
The result is stored in a file.
head(
read.table(file.path(tempdir(), "ctcisQTL.out.txt"),
header = TRUE,
sep ="\t"))
#> term response celltype estimate statistic p.value
#> 1 rs6678176 cg19251656 CD4T -0.0031975545 -0.3112062 0.75693773
#> 2 rs6678176 cg19251656 CD8T 0.0029236713 0.4368269 0.66411762
#> 3 rs6678176 cg19251656 NK 0.0021900746 0.6992125 0.48765929
#> 4 rs6678176 cg01433766 CD4T -0.0007853244 -0.4642848 0.64445898
#> 5 rs6678176 cg01433766 CD8T -0.0006336713 -0.5359356 0.59437935
#> 6 rs6678176 cg01433766 NK -0.0012465110 -1.9903623 0.05203197
The first 3 lines show the result for the association of SNP rs6678176 with CpG site cg19251656. Line 1 shows that the CD4T-specific effect of the SNP is -0.003. Since the p.value is 0.75, this is not significant.