Introduction

This file contains some examples of the functions related to the DDC routine. More specifically, DDC and cellMap will be illustrated.

library("cellWise")
library("gridExtra") # has grid.arrange()

# Default options for DDC:
DDCpars = list(fracNA = 0.5, numDiscrete = 3, precScale = 1e-12,
               cleanNAfirst = "automatic", tolProb = 0.99, 
               corrlim = 0.5, combinRule = "wmean",
               returnBigXimp = FALSE, silent = FALSE,
               nLocScale = 25000, fastDDC = FALSE,
               standType = "1stepM", corrType = "gkwls",
               transFun = "wrap", nbngbrs = 100)

# A small list giving the same results:
DDCpars = list(fastDDC = FALSE)

Example with row and column selection

First we illustrate the selection of columns and rows by DDC. This is actually done by the function checkDataSet which is called by DDC.

i = c(1,2,3,4,5,6,7,8,9) 
name = c("aa","bb","cc","dd","ee","ff","gg","hh","ii") 
logic = c(TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE) 
V1 = c(1.3,NaN,4.5,2.7,20.0,4.4,-2.1,1.1,-5)
V2 = c(2.3,NA,5,6,7,8,4,-10,0.5)
V3 = c(2,Inf,3,-4,5,6,7,-2,8)
Vna = c(1,-4,2,NaN,3,-Inf,NA,6,5)
Vdis = c(1,1,2,2,3,3,3,1,2)
V0s = c(1,1.5,2,2,2,2,2,3,2.5) 
datafr = data.frame(i,name,logic,V1,V2,V3,Vna,Vdis,V0s) 
datafr

##   i name logic   V1    V2  V3  Vna Vdis V0s
## 1 1   aa  TRUE  1.3   2.3   2    1    1 1.0
## 2 2   bb FALSE  NaN    NA Inf   -4    1 1.5
## 3 3   cc  TRUE  4.5   5.0   3    2    2 2.0
## 4 4   dd FALSE  2.7   6.0  -4  NaN    2 2.0
## 5 5   ee FALSE 20.0   7.0   5    3    3 2.0
## 6 6   ff  TRUE  4.4   8.0   6 -Inf    3 2.0
## 7 7   gg  TRUE -2.1   4.0   7   NA    3 2.0
## 8 8   hh  TRUE  1.1 -10.0  -2    6    1 3.0
## 9 9   ii FALSE -5.0   0.5   8    5    2 2.5

DDCdatafr = DDC(datafr,DDCpars)

##  
##  The input data has 9 rows and 9 columns.
##  
##  The input data contained 2 non-numeric columns (variables).
##  Their column names are:
##  
## [1] name  logic
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 7 numeric columns:
##  
## [1] i    V1   V2   V3   Vna  Vdis V0s 
##  
##  The data contained 1 columns that were identical to the case number
##  (number of the row).
##  Their column names are:
##  
## [1] i
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 6 columns:
##  
## [1] V1   V2   V3   Vna  Vdis V0s 
##  
##  The data contained 1 discrete columns with 3 or fewer values.
##  Their column names are:
##  
## [1] Vdis
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 5 columns:
##  
## [1] V1  V2  V3  Vna V0s
##  
##  The data contained 1 columns with zero or tiny median absolute deviation.
##  Their column names are:
##  
## [1] V0s
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 4 columns:
##  
## [1] V1  V2  V3  Vna
##  
##  The final data set we will analyze has 9 rows and 4 columns.
##

remX = DDCdatafr$remX; dim(remX)

## [1] 9 4

cellMap(DDCdatafr$stdResid)

plot of chunk unnamed-chunk-3

# Red cells have higher value than predicted, blue cells lower,
# white cells are missing values, all other cells are yellow.

Small generated dataset

set.seed(12345) # for reproducibility
n <- 50; d <- 20
A <- matrix(0.9, d, d); diag(A) = 1
round(A[1:10,1:10],1) # true covariance matrix

##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]  1.0  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9   0.9
##  [2,]  0.9  1.0  0.9  0.9  0.9  0.9  0.9  0.9  0.9   0.9
##  [3,]  0.9  0.9  1.0  0.9  0.9  0.9  0.9  0.9  0.9   0.9
##  [4,]  0.9  0.9  0.9  1.0  0.9  0.9  0.9  0.9  0.9   0.9
##  [5,]  0.9  0.9  0.9  0.9  1.0  0.9  0.9  0.9  0.9   0.9
##  [6,]  0.9  0.9  0.9  0.9  0.9  1.0  0.9  0.9  0.9   0.9
##  [7,]  0.9  0.9  0.9  0.9  0.9  0.9  1.0  0.9  0.9   0.9
##  [8,]  0.9  0.9  0.9  0.9  0.9  0.9  0.9  1.0  0.9   0.9
##  [9,]  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9  1.0   0.9
## [10,]  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9   1.0

library(MASS) # only needed for the following line:
x <- mvrnorm(n, rep(0,d), A)
x[sample(1:(n * d), 50, FALSE)] <- NA
x[sample(1:(n * d), 50, FALSE)] <- 10
x[sample(1:(n * d), 50, FALSE)] <- -10

# When not specifying DDCpars in the call to DDC
# all defaults are used:
DDCx <- DDC(x)

##  
##  The input data has 50 rows and 20 columns.

cellMap(DDCx$stdResid)

plot of chunk unnamed-chunk-4

# Red cells have higher value than predicted, blue cells lower,
# white cells are missing values, all other cells are yellow.

DDCx$DDCpars # These are the default options:

## $fracNA
## [1] 0.5
## 
## $numDiscrete
## [1] 3
## 
## $precScale
## [1] 1e-12
## 
## $cleanNAfirst
## [1] "automatic"
## 
## $tolProb
## [1] 0.99
## 
## $corrlim
## [1] 0.5
## 
## $combinRule
## [1] "wmean"
## 
## $returnBigXimp
## [1] FALSE
## 
## $silent
## [1] FALSE
## 
## $nLocScale
## [1] 25000
## 
## $fastDDC
## [1] FALSE
## 
## $standType
## [1] "1stepM"
## 
## $corrType
## [1] "gkwls"
## 
## $transFun
## [1] "wrap"
## 
## $nbngbrs
## [1] 100

names(DDCx)

##  [1] "DDCpars"         "colInAnalysis"   "rowInAnalysis"   "namesNotNumeric"
##  [5] "namesCaseNumber" "namesNAcol"      "namesNArow"      "namesDiscrete"  
##  [9] "namesZeroScale"  "remX"            "locX"            "scaleX"         
## [13] "Z"               "nbngbrs"         "ngbrs"           "robcors"        
## [17] "robslopes"       "deshrinkage"     "Xest"            "scalestres"     
## [21] "stdResid"        "indcells"        "Ti"              "medTi"          
## [25] "madTi"           "indrows"         "indall"          "indNAs"         
## [29] "Ximp"

# We will now go through these outputs one by one:

DDCx$colInAnalysis # all columns X1,...,X20 remain:

##  X1  X2  X3  X4  X5  X6  X7  X8  X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20

DDCx$rowInAnalysis # all rows 1,...,50 remain:

##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50

DDCx$namesNotNumeric # no non-numeric columns:

## NULL

DDCx$namesCaseNumber # no column was the case number:

## NULL

DDCx$namesNAcol # no columns with too many NA's:

## NULL

DDCx$namesNArow # no columns with too many NA's:

## NULL

DDCx$namesDiscrete # no discrete columns:

## NULL

DDCx$namesZeroScale # no columns with scale = 0:

## NULL

dim(DDCx$remX) # remaining data matrix

## [1] 50 20

round(DDCx$locX,2) # robust location estimates of all 20 columns:

##    X1    X2    X3    X4    X5    X6    X7    X8    X9   X10   X11   X12   X13 
## -0.22 -0.33 -0.37 -0.23 -0.31 -0.42 -0.40 -0.25 -0.24 -0.17 -0.34 -0.31 -0.42 
##   X14   X15   X16   X17   X18   X19   X20 
## -0.33 -0.26 -0.23 -0.40 -0.30 -0.35 -0.30

round(DDCx$scaleX,2) # robust scale estimates of all 20 columns:

##   X1   X2   X3   X4   X5   X6   X7   X8   X9  X10  X11  X12  X13  X14  X15  X16 
## 1.32 1.36 1.12 1.15 1.23 1.35 1.07 1.45 1.04 0.96 1.47 1.37 1.22 1.40 1.40 1.42 
##  X17  X18  X19  X20 
## 1.47 1.31 1.44 1.35

round(DDCx$Z[1:10,1:10],1)

##      X1   X2   X3   X4   X5   X6   X7   X8   X9  X10
## 1  -0.3  0.1  0.1 -0.4  0.1 -0.4  0.2 -0.2 -0.1 -0.3
## 2   0.0 -0.1 -0.3 -0.4 -0.6 -7.1 -0.1 -0.8 -0.6 -0.3
## 3   0.1  0.3  0.1  0.6  0.7  0.4  0.5  0.4   NA  0.4
## 4   0.4  0.6  0.6   NA   NA  0.7  0.8 -6.7  0.8  0.2
## 5   0.2 -0.2   NA  0.0 -0.1 -0.1 -0.6  0.0  0.1 -0.4
## 6   1.6  1.6  2.0  1.8  1.5  1.6  1.6  1.4  2.2  1.8
## 7    NA -0.1 -0.1 -0.5 -0.2  0.1 -0.2  7.1  0.0 -0.5
## 8   0.5  0.6  0.3  0.6  0.4  0.0 -9.0  0.7  0.6  0.2
## 9   0.3  0.8  0.6  0.4  0.2  0.4  0.3  0.1  0.4  1.0
## 10  0.8  0.6  1.2  1.5  1.3  1.2   NA  0.6  1.1  0.6

# Robustly standardized dataset. Due to the high correlations,
# cells in the same row look similar (except for outlying cells).

DDCx$nbngbrs

## [1] 19

# For each column the code looked for up to 19 non-self neighbors (highly correlated columns).
# It goes through all of them, unless fastDDC is set to TRUE.

DDCx$ngbrs[1:3,]

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]    1   14   17    4   12    3   16    8   20    18    19     5    11    15
## [2,]    2   17   18   12   11   15   16   10   13     1     4     6     3    19
## [3,]    3   12   15    1   17    6   11    5    7     4    18    16    10     2
##      [,15] [,16] [,17] [,18] [,19] [,20]
## [1,]     6     2     9    10     7    13
## [2,]    14     7     5     9    20     8
## [3,]    14    19     8    20     9    13

# Shows the neighbors, e.g. the nearest non-self neighbor of X1 is X11, then X2,...

round(DDCx$robcors[1:3,],2)

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]    1 0.96 0.94 0.93 0.93 0.93 0.93 0.93 0.93  0.93  0.93  0.92  0.92  0.91
## [2,]    1 0.96 0.94 0.93 0.93 0.92 0.92 0.92 0.91  0.91  0.91  0.91  0.90  0.90
## [3,]    1 0.96 0.93 0.93 0.92 0.92 0.92 0.91 0.91  0.91  0.91  0.91  0.90  0.90
##      [,15] [,16] [,17] [,18] [,19] [,20]
## [1,]  0.91  0.91  0.91  0.91  0.90  0.89
## [2,]  0.89  0.89  0.88  0.88  0.87  0.85
## [3,]  0.90  0.90  0.90  0.90  0.89  0.89

# Robust correlations with these neighbors. In each row the correlations
# are sorted by decreasing absolute value. 

round(DDCx$robslopes[1:3,],2)

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]    1 0.99 0.99 0.85 1.03 0.76 1.02 1.02 0.93  0.95  0.93  0.84  0.93  0.99
## [2,]    1 1.00 0.91 0.99 0.91 0.94 0.99 0.77 0.85  0.94  0.82  0.88  0.76  0.94
## [3,]    1 1.18 1.17 1.16 1.07 1.07 1.04 1.01 0.91  0.97  1.14  1.15  0.79  1.08
##      [,15] [,16] [,17] [,18] [,19] [,20]
## [1,]  0.91  0.88  0.70  0.70  0.73  0.82
## [2,]  0.94  0.76  0.85  0.66  0.84  0.89
## [3,]  1.13  1.16  1.07  1.06  0.83  1.02

# For each column, the slope of each neighbor predicting it.
# For instance, X1 is predicted by its first neighbor with 
# slope 0.97 and by its second neighbor with slope 0.81 .

round(DDCx$deshrinkage,2)

##   X1   X2   X3   X4   X5   X6   X7   X8   X9  X10  X11  X12  X13  X14  X15  X16 
## 1.09 1.08 1.08 1.10 1.10 1.08 1.08 1.10 1.11 1.13 1.10 1.07 1.09 1.09 1.08 1.06 
##  X17  X18  X19  X20 
## 1.10 1.06 1.10 1.11

# For each column, the factor by which its prediction is multiplied.

round(DDCx$Xest[1:12,1:10],2) # the estimated cells of remX:

##       X1    X2    X3    X4    X5    X6    X7    X8    X9   X10
## 1  -0.48 -0.58 -0.61 -0.48 -0.55 -0.69 -0.63 -0.52 -0.49 -0.41
## 2  -0.53 -0.63 -0.67 -0.54 -0.63 -0.75 -0.69 -0.59 -0.55 -0.46
## 3   0.22  0.12  0.07  0.22  0.14  0.04  0.02  0.23  0.20  0.26
## 4   0.59  0.50  0.44  0.58  0.51  0.44  0.35  0.63  0.56  0.61
## 5  -0.45 -0.58 -0.61 -0.46 -0.54 -0.66 -0.63 -0.50 -0.46 -0.40
## 6   1.77  1.68  1.61  1.71  1.66  1.66  1.46  1.86  1.75  1.74
## 7  -0.42 -0.53 -0.56 -0.43 -0.51 -0.62 -0.59 -0.46 -0.42 -0.36
## 8   0.34  0.23  0.17  0.32  0.25  0.14  0.10  0.35  0.31  0.36
## 9   0.36  0.27  0.20  0.34  0.26  0.17  0.13  0.35  0.33  0.40
## 10  0.97  0.86  0.80  0.95  0.89  0.82  0.70  1.02  0.92  0.97
## 11  0.18  0.07  0.03  0.16  0.09 -0.01 -0.03  0.17  0.14  0.21
## 12 -1.62 -1.73 -1.73 -1.58 -1.69 -1.83 -1.68 -1.71 -1.59 -1.52

round(DDCx$stdResid[1:12,1:10],1)

##      X1    X2   X3   X4   X5    X6    X7    X8    X9  X10
## 1  -0.5   1.1  1.4 -0.6  1.1  -0.7   1.3  -0.1   0.4 -0.3
## 2   0.9   0.5 -0.1 -0.4 -1.5 -25.3   0.7  -2.0  -0.9  0.2
## 3  -1.0   0.0 -1.1  0.6  1.2   0.2   0.3   0.2    NA -0.3
## 4  -0.8   0.1 -0.7   NA   NA   0.4   0.1 -28.1   0.2 -2.1
## 5   1.5  -0.2   NA  0.8  0.2   0.5  -1.1   0.7   1.0 -0.6
## 6   0.5   0.5  0.8  0.4 -0.4   0.1  -0.6  -0.2   0.8 -0.6
## 7    NA   0.2  0.1 -1.1 -0.3   0.8  -0.1  27.6   0.5 -1.0
## 8   0.3   0.8 -0.6  0.5 -0.1  -1.4 -30.3   0.9   0.2 -1.1
## 9  -0.3   1.4  0.2 -0.4 -0.9   0.0  -0.6  -1.4  -0.5  1.6
## 10 -0.3  -0.8  0.5  1.5  1.1   0.9    NA  -1.0  -0.1 -1.9
## 11   NA  -0.2  1.1 -1.4 -1.0   0.1   0.5   0.1  -0.8 -1.7
## 12 -2.3 -21.1  0.7 -0.6  0.1   1.7   0.8  -0.4 -22.1 -2.0

# The standardized residuals of the cells. Note the NA's and some
# large positive and negative cell residuals.

qqnorm(as.vector(DDCx$stdResid)) # Note the far outliers on both sides:

plot of chunk unnamed-chunk-6

as.vector(DDCx$indcells) # indices of the cells that were flagged by DDC:

##   [1]  27  34  35  41  62  76  79  80  90  93 100 114 142 146 164 178 180 195
##  [19] 197 219 225 237 245 252 267 273 281 284 292 297 299 308 342 354 357 367
##  [37] 388 396 397 412 430 433 439 465 482 506 529 534 548 553 561 564 566 585
##  [55] 594 597 628 631 643 648 654 663 669 675 685 696 699 704 705 715 728 736
##  [73] 754 757 758 760 784 806 816 817 831 836 843 868 881 886 887 896 904 905
##  [91] 910 921 925 942 950 956 978 981 992 996

plot(DDCx$Ti) # the Ti values of the rows. None are high.

plot of chunk unnamed-chunk-6

qqnorm(DDCx$Ti)

plot of chunk unnamed-chunk-6

DDCx$medTi # median of the raw Ti (used to standardize Ti):

## [1] 0.001298725

DDCx$madTi # median absolute deviation of the raw Ti (used to standardize Ti):

## [1] 0.06241016

DDCx$indrows # numeric(0) means no rows are flagged:

## numeric(0)

as.vector(DDCx$indall) # indices of the flagged cells, including those in flagged rows:

##   [1]  27  34  35  41  62  76  79  80  90  93 100 114 142 146 164 178 180 195
##  [19] 197 219 225 237 245 252 267 273 281 284 292 297 299 308 342 354 357 367
##  [37] 388 396 397 412 430 433 439 465 482 506 529 534 548 553 561 564 566 585
##  [55] 594 597 628 631 643 648 654 663 669 675 685 696 699 704 705 715 728 736
##  [73] 754 757 758 760 784 806 816 817 831 836 843 868 881 886 887 896 904 905
##  [91] 910 921 925 942 950 956 978 981 992 996

as.vector(DDCx$indNAs) # indices of the missing cells:

##  [1]   7  11  28  44  97 105 154 174 176 184 204 249 264 310 348 403 427 443 489
## [20] 491 517 523 550 556 589 602 606 609 615 624 645 646 658 700 725 798 803 814
## [39] 867 870 945 954 979 980 999

round(DDCx$Ximp[1:10,1:10],2)

##       X1    X2    X3    X4    X5    X6    X7    X8    X9   X10
## 1  -0.66 -0.16 -0.22 -0.66 -0.22 -0.94 -0.20 -0.57 -0.32 -0.49
## 2  -0.21 -0.45 -0.69 -0.66 -1.09 -0.75 -0.46 -1.36 -0.90 -0.41
## 3  -0.13  0.13 -0.24  0.43  0.51  0.11  0.11  0.31  0.20  0.19
## 4   0.32  0.53  0.25  0.58  0.51  0.58  0.40  0.63  0.64  0.06
## 5   0.07 -0.65 -0.61 -0.20 -0.48 -0.49 -1.01 -0.24 -0.10 -0.56
## 6   1.93  1.89  1.85  1.85  1.55  1.69  1.27  1.77  2.07  1.59
## 7  -0.42 -0.46 -0.54 -0.78 -0.60 -0.34 -0.62 -0.46 -0.22 -0.63
## 8   0.44  0.53  0.01  0.47  0.23 -0.37  0.10  0.71  0.37  0.07
## 9   0.24  0.81  0.26  0.21 -0.03  0.16 -0.06 -0.16  0.13  0.81
## 10  0.88  0.55  0.94  1.45  1.24  1.17  0.70  0.64  0.88  0.46

# The imputed matrix. Both the cellwise outliers and the missing values
# are replaced by their predicted values.

round((DDCx$Ximp - DDCx$remX)[1:10,1:10],2)

##    X1 X2 X3 X4 X5   X6   X7     X8 X9 X10
## 1   0  0  0  0  0 0.00  0.0   0.00  0   0
## 2   0  0  0  0  0 9.25  0.0   0.00  0   0
## 3   0  0  0  0  0 0.00  0.0   0.00 NA   0
## 4   0  0  0 NA NA 0.00  0.0  10.63  0   0
## 5   0  0 NA  0  0 0.00  0.0   0.00  0   0
## 6   0  0  0  0  0 0.00  0.0   0.00  0   0
## 7  NA  0  0  0  0 0.00  0.0 -10.46  0   0
## 8   0  0  0  0  0 0.00 10.1   0.00  0   0
## 9   0  0  0  0  0 0.00  0.0   0.00  0   0
## 10  0  0  0  0  0 0.00   NA   0.00  0   0

# The nonzero values and the NA's correspond to imputed cells.

TopGear dataset

The Top Gear data contains information on 297 cars.

library(robustHD)
data(TopGear)

dim(TopGear)

## [1] 297  32

rownames(TopGear)[1:13] # "1" to "297" are not useful names

##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13"

rownames(TopGear) = paste(TopGear[,1],TopGear[,2]) 
# Now the rownames are make and model of the cars.
rownames(TopGear)[165] = "Mercedes-Benz G" # name was too long
myTopGear = TopGear[,-31] # removes the subjective variable `Verdict'

# Transform some variables to get roughly gaussianity in the center:
transTG = myTopGear
transTG$Price = log(myTopGear$Price)
transTG$Displacement = log(myTopGear$Displacement)
transTG$BHP = log(myTopGear$BHP)
transTG$Torque = log(myTopGear$Torque)
transTG$TopSpeed = log(myTopGear$TopSpeed)

# Run the DDC method:
DDCpars = list(fastDDC = FALSE, silent = TRUE)
DDCtransTG = DDC(transTG,DDCpars)

##  
##  The final data set we will analyze has 296 rows and 11 columns.
##

# With DDCpars = list(fastDDC = FALSE, silent = FALSE) we obtain more information:
# 
# The input data has 297 rows and 31 columns.
# 
# The input data contained 19 non-numeric columns (variables).
# Their column names are:
#   
#  [1] Maker         Model              Type               Fuel              
#  [5] DriveWheel    AdaptiveHeadlights AdjustableSteering AlarmSystem       
#  [9] Automatic     Bluetooth          ClimateControl     CruiseControl     
# [13] ElectricSeats Leather            ParkingSensors     PowerSteering     
# [17] SatNav        ESP                Origin            
# 
# These columns will be ignored in the analysis.
# We continue with the remaining 12 numeric columns:
#   
# [1] Price Cylinders Displacement BHP   Torque Acceleration TopSpeed    
# [8] MPG   Weight    Length       Width Height      
# 
# The data contained 1 rows with over 50% of NAs.
# Their row names are:
#   
#   [1] Citroen C5 Tourer
# 
# These rows will be ignored in the analysis.
# We continue with the remaining 296 rows:
#   
#   [1] Alfa Romeo Giulietta             Alfa Romeo MiTo                 
# .......
# [295] Volvo XC70                       Volvo XC90                      
# 
# The data contained 1 columns with zero or tiny median absolute deviation.
# Their column names are:
# 
# [1] Cylinders
# 
# These columns will be ignored in the analysis.
# We continue with the remaining 11 columns:
# 
# [1] Price  Displacement BHP   Torque Acceleration TopSpeed MPG         
# [8] Weight Length       Width Height      
# 
# The final data set we will analyze has 296 rows and 11 columns.

remX = DDCtransTG$remX # the remaining part of the dataset
dim(remX)

## [1] 296  11

colSums(is.na(remX)) # There are still NAs, mainly in `Weight':

##        Price Displacement          BHP       Torque Acceleration     TopSpeed 
##            0            8            3            3            0            3 
##          MPG       Weight       Length        Width       Height 
##           11           32           10           15           10

# Analyze the data by column:
standX = scale(remX,apply(remX,2,median,na.rm = TRUE), 
               apply(remX,2,mad,na.rm = TRUE))
dim(standX)

## [1] 296  11

round(standX[1:5,],1) # has NAs where remX does

##                          Price Displacement  BHP Torque Acceleration TopSpeed
## Alfa Romeo Giulietta      -0.4         -0.4 -0.7    0.0          0.6     -0.5
## Alfa Romeo MiTo           -0.9         -0.7 -0.7   -1.6          0.4     -0.4
## Aston Martin Cygnet        0.3         -0.8 -0.8   -1.7          0.7     -0.9
## Aston Martin DB9           2.7          2.1  1.9    1.2         -1.2      2.0
## Aston Martin DB9 Volante   2.8          2.1  1.9    1.2         -1.2      2.0
##                           MPG Weight Length Width Height
## Alfa Romeo Giulietta      1.0   -0.3   -0.3  -0.2   -0.2
## Alfa Romeo MiTo           0.1   -1.0   -0.9  -1.1   -0.3
## Aston Martin Cygnet       0.6   -1.3   -3.1  -1.5    0.1
## Aston Martin DB9         -1.7    0.8    0.6    NA   -1.6
## Aston Martin DB9 Volante -1.7    1.0    0.6    NA   -1.6

transTGcol = remX
transTGcol[abs(standX) > sqrt(qchisq(0.99,1))] = NA
round(transTGcol[1:5,],1) # has NAs in outlying cells as well:

##                          Price Displacement BHP Torque Acceleration TopSpeed
## Alfa Romeo Giulietta      10.0          7.4 4.7    5.5         11.3      4.7
## Alfa Romeo MiTo            9.6          7.2 4.7    4.6         10.7      4.8
## Aston Martin Cygnet       10.3          7.2 4.6    4.5         11.8      4.7
## Aston Martin DB9            NA          8.7 6.2    6.1          4.6      5.2
## Aston Martin DB9 Volante    NA          8.7 6.2    6.1          4.6      5.2
##                          MPG Weight Length Width Height
## Alfa Romeo Giulietta      64   1385   4351  1798   1465
## Alfa Romeo MiTo           49   1090   4063  1720   1446
## Aston Martin Cygnet       56    988     NA  1680   1500
## Aston Martin DB9          19   1785   4720    NA   1282
## Aston Martin DB9 Volante  19   1890   4720    NA   1282

# Make untransformed submatrix of X for labeling the cells in the plot:
tempX = myTopGear[DDCtransTG$rowInAnalysis,DDCtransTG$colInAnalysis]
tempX$Price = tempX$Price/1000 # to avoid printing long numbers
dim(tempX)

## [1] 296  11

# Show the following 17 cars in the cellmap:
showrows = c(12,42,56,73,81,94,99,135,150,164,176,198,209,215,234,241,277)

# Make two ggplot2 objects:
ggpcol = cellMap(standX, showcellvalues="D", D=tempX, 
                 mTitle="By column", showrows=showrows,
                 sizecellvalues = 0.6, adjustrowlabels=0.5)  
plot(ggpcol)

plot of chunk unnamed-chunk-9

ggpDDC = cellMap(DDCtransTG$stdResid, showcellvalues="D", D=tempX,  
                 mTitle="DetectDeviatingCells", showrows=showrows,
                 sizecellvalues = 0.6, adjustrowlabels=0.5)  
plot(ggpDDC)

plot of chunk unnamed-chunk-10

# Creating the pdf:
# pdf("cellMap_TopGear.pdf", width = 12, height = 10)
# gridExtra::grid.arrange(ggpcol,ggpDDC,nrow=1) # combines 2 plots in a figure
# dev.off()

Analyzing new data by DDCpredict

We now consider the 17 cars shown in the cellmap as a `new’ dataset.

# Top Gear dataset: prediction of "new" data
############################################
# For comparison we first remake the cell map of the entire dataset, but now 
# showing the values of the residuals instead of the data values:

dim(remX) # 296 11

## [1] 296  11

ggpDDC = cellMap(DDCtransTG$stdResid, showcellvalues="R", 
                 sizecellvalues = 0.7, mTitle="DetectDeviatingCells", 
                 showrows=showrows, adjustrowlabels=0.5)
plot(ggpDDC)

plot of chunk unnamed-chunk-11

Define the “initial” dataset as the rows not in these 17:

initX = remX[-showrows,]
dim(initX) # 279 11

## [1] 279  11

# Fit initX:
DDCinitX = DDC(initX,DDCpars=DDCpars)

##  
##  The final data set we will analyze has 278 rows and 11 columns.
##

Define the “new” dataset, and apply DDCpredict to it:

newX = remX[showrows,]
dim(newX) # 17  11

## [1] 17 11

# Make predictions by DDCpredict. 
# Its inputs are:
#   Xnew       : the new data (test data)
#   InitialDDC : Must be provided.
#   DDCpars    : the input options to be used for the prediction.
#                By default the options of InitialDDC are used. 

predictDDC = DDCpredict(newX,DDCinitX)

names(DDCinitX)

##  [1] "DDCpars"         "colInAnalysis"   "rowInAnalysis"   "namesNotNumeric"
##  [5] "namesCaseNumber" "namesNAcol"      "namesNArow"      "namesDiscrete"  
##  [9] "namesZeroScale"  "remX"            "locX"            "scaleX"         
## [13] "Z"               "nbngbrs"         "ngbrs"           "robcors"        
## [17] "robslopes"       "deshrinkage"     "Xest"            "scalestres"     
## [21] "stdResid"        "indcells"        "Ti"              "medTi"          
## [25] "madTi"           "indrows"         "indall"          "indNAs"         
## [29] "Ximp"

# For comparison with:

names(predictDDC) # Fewer, since DDCpredict does not call checkDataSet:

##  [1] "DDCpars"     "locX"        "scaleX"      "Z"           "nbngbrs"    
##  [6] "ngbrs"       "robcors"     "robslopes"   "deshrinkage" "Xest"       
## [11] "scalestres"  "stdResid"    "indcells"    "Ti"          "medTi"      
## [16] "madTi"       "indrows"     "indNAs"      "indall"      "Ximp"

# If you specify the parameters the result is the same:
predictDDC2 = DDCpredict(newX,DDCinitX,DDCpars=DDCpars)
all.equal(predictDDC,predictDDC2) # TRUE

## [1] TRUE

ggpnew = cellMap(predictDDC$stdResid, showcellvalues="R",
                 sizecellvalues = 0.7, mTitle="DDCpredict",
                 adjustrowlabels=0.5) 
plot(ggpnew) # Looks quite similar to the result using the entire dataset:

plot of chunk unnamed-chunk-14

# Creating the pdf:
# pdf("TopGear_DDCpredict.pdf",width=12,height=10)
# gridExtra::grid.arrange(ggpDDC,ggpnew,nrow=1)
# dev.off()

Philips data

The philips data contains 9 measurements of TV parts from a production line.

data(data_philips)
dim(data_philips)

## [1] 677   9

colnames(data_philips) = c("X1","X2","X3","X4","X5","X6","X7","X8","X9")

DDCphilips = DDC(data_philips)

##  
##  The input data has 677 rows and 9 columns.

qqnorm(as.vector(DDCphilips$Z)) # rather gaussian, here we only see 2 outliers:

plot of chunk unnamed-chunk-15

round(DDCphilips$stdResid[1:12,],1) # the standardized residuals:

##      X1   X2   X3   X4   X5   X6  X7   X8   X9
## 1  -0.9  2.2  0.0  1.2  0.8 -2.9 2.1  0.1 -1.4
## 2  -0.5  2.0  0.0  3.9  0.8 -1.1 3.5 -0.4 -1.7
## 3  -1.0  2.3 -0.4  0.7  0.6 -2.0 1.4  0.0 -1.7
## 4  -0.6  2.3 -0.1  3.8  1.1 -0.9 3.4 -0.5 -2.0
## 5  -0.6  2.8 -0.2  3.4  0.9 -0.9 3.4  0.2 -1.9
## 6  -0.7  2.6  0.0  4.4  1.0 -1.1 4.2 -0.1 -2.0
## 7   1.0 -0.1 -1.0 -0.3 -1.2 -1.0 1.4  0.8 -0.6
## 8  -0.4 -1.4 -0.7  1.8 -0.7 -2.8 1.9 -0.3 -1.7
## 9   1.0 -0.2  0.0 -0.1 -1.2 -2.1 1.6  1.0 -0.9
## 10  0.2 -1.3 -0.1  1.0 -0.5 -2.9 2.0 -0.1 -1.5
## 11 -0.1 -0.3 -0.1  0.2 -1.1 -1.5 1.9  0.8 -0.8
## 12 -0.1 -0.4 -1.5  2.5  0.0 -3.4 2.3  0.3 -2.5

DDCphilips$indcells # indices of the cells that were flagged:

##   [1]   55   70   95  104  116  144  151  156  161  170  174  175  324  468  682
##  [16]  683  712  747  750  792  985  986  987  988  989  991  992 1006 1007 1008
##  [31] 1010 1011 1196 1199 1450 1452 1455 1474 1529 1787 2033 2035 2036 2037 2045
##  [46] 2047 2094 2101 2114 2116 2126 2135 2267 2312 2355 2365 2464 2499 2592 2743
##  [61] 2775 2777 3120 3216 3226 3227 3228 3248 3386 3393 3395 3397 3399 3401 3412
##  [76] 3413 3414 3415 3416 3417 3423 3424 3425 3426 3427 3428 3430 3431 3437 3445
##  [91] 3456 3457 3460 3464 3465 3466 3468 3469 3470 3990 4064 4066 4067 4068 4076
## [106] 4078 4088 4145 4147 4792 4794 5036 5037 5168 5169 5170 5171 5172 5174 5175
## [121] 5176 5230 5238 5241 5246 5248 5250 5253 5256 5258 5259 5260 5262 5263 5265
## [136] 5267 5268 5270 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5569
## [151] 5869 5873 5882 5890 5902

DDCphilips$indrows # flagged rows:

## [1]  26  56  84 334 491

We also apply the rowwise method MCD to detect outlying rows:

library(robustbase) # for covMcd

MCDphilips = robustbase::covMcd(data_philips)
indrowsMCD = which(mahalanobis(data_philips,MCDphilips$center,
                               MCDphilips$cov) > qchisq(0.975,df=9))

plot(sqrt(mahalanobis(data_philips,MCDphilips$center,MCDphilips$cov)),
     main="Philips data",ylab="Robust distances",xlab="",pch=20)
abline(h=sqrt(qchisq(0.975,df=9))) # this horizontal line is the cutoff.

plot of chunk unnamed-chunk-17

# dev.copy(pdf,"Figure_philips_left.pdf",width=10,height=4)
# dev.off()

# cellMaps with rectangular blocks:

ggpMCDphilips = cellMap(data_philips, indrows=indrowsMCD,
                        mTitle="MCD", nrowsinblock=15,
                        ncolumnsinblock=1, drawCircles = TRUE)

## No rowblocklabels were given, so they are constructed automatically.

plot(ggpMCDphilips)

plot of chunk unnamed-chunk-18

ggpDDCphilips = cellMap(DDCphilips$stdResid, indrows=DDCphilips$indrows,
                        mTitle="DetectDeviatingCells", nrowsinblock=15,
                        ncolumnsinblock=1, drawCircles = TRUE)

## No rowblocklabels were given, so they are constructed automatically.

plot(ggpDDCphilips)

plot of chunk unnamed-chunk-18

# dev.copy(pdf,"Figure_philips_right.pdf",width=6,height=12)
# dev.off()

Mortality dataset

This dataset contains the mortality of French males in the years 1816 to 2013.

data(data_mortality)
dim(data_mortality)

## [1] 198  91

# 198  91
rownames(data_mortality)[1:5]

## [1] "1816" "1817" "1818" "1819" "1820"

colnames(data_mortality)[1:5]

## [1] "0" "1" "2" "3" "4"

DDCpars = list(fastDDC = FALSE, silent = TRUE)
DDCmortality = DDC(data_mortality,DDCpars) # 1 second

remX = DDCmortality$remX
dim(remX)

## [1] 198  91

We also apply the rowwise method ROBPCA to detect outlying rows:

library(rrcov) # contains ROBPCA

PCAmortality = rrcov::PcaHubert(data_mortality,alpha=0.75,scale=FALSE)

ggpROBPCA = cellMap(remX, indrows=which(PCAmortality@flag==FALSE),
                    mTitle="By row", nrowsinblock=5,
                    ncolumnsinblock=5, rowtitle = "Years",
                    columntitle = "Age", sizetitles = 1.5,
                    drawCircles = TRUE)

## No rowblocklabels were given, so they are constructed automatically.
## No columnblocklabels were given, so they are constructed automatically.

plot(ggpROBPCA)

plot of chunk unnamed-chunk-21

ggpDDC = cellMap(DDCmortality$stdResid, 
                 mTitle="DetectDeviatingCells", nrowsinblock=5,
                 ncolumnsinblock=5, rowtitle = "Years",
                 columntitle = "Age", sizetitles = 1.5)

## No rowblocklabels were given, so they are constructed automatically.
## No columnblocklabels were given, so they are constructed automatically.

plot(ggpDDC) # Leads to a detailed interpretation:

plot of chunk unnamed-chunk-21

# pdf("cellmap_mortality.pdf",width=9,height=8)
# gridExtra::grid.arrange(ggpROBPCA,ggpDDC,nrow=1)
# dev.off()

To illustrate the ability to make row blocks and column blocks of arbitrary sizes, we create the following artificial groupings:

rowblocksizes = c(84,14,5,21,6,35,33)
rowlabels = c("19th century","1900-1913","WW1","1919-1939",
"WW2","1946-1980","recent")
colblocksizes = c(5,5,10,10,10,10,10,10,10,10,1)
collabels = c("upto 4","5 to 9","10 to 19","20 to 29","30 to 39","40 to 49","50 to 59","60 to 69","70 to 79","80 to 89","90+")

ggpDDC = cellMap(DDCmortality$stdResid,
                 mTitle = "Cellmap with manual blocks",
                 manualrowblocksizes = rowblocksizes,
                 rowblocklabels = rowlabels,
                 manualcolumnblocksizes = colblocksizes,
                 columnblocklabels = collabels,
                 rowtitle = "Epochs",
                 columntitle = "Age groups",
                 sizetitles = 1.2)
# pdf("cellmap_mortality_manual_blocks.pdf",width=5,height=3)
plot(ggpDDC)

plot of chunk unnamed-chunk-22

# dev.off()

Glass dataset

The glass data consists of spectra with 750 wavelengths of 180 archaeological glass samples.

data(data_glass)
DDCpars = list(fastDDC = FALSE, silent = TRUE)
DDCglass = DDC(data_glass,DDCpars) # takes 8 seconds

##  
##  The final data set we will analyze has 180 rows and 737 columns.
##

remX = DDCglass$remX
# With DDCpars$silent = FALSE we obtain more information:
#
#  The input data has 180 rows and 750 columns.
#
#  The data contained 11 discrete columns with 3 or fewer values.
#  Their column names are:
#
#  [1] V1  V2  V3  V4  V5  V6  V7  V8  V9  V10 V11
#
#  These columns will be ignored in the analysis.
#  We continue with the remaining 739 columns:
#  
#   [1] V12  V13  V14  V15  V16  V17  V18  V19  V20  V21  V22  V23  V24  V25
#    ......
# [729] V740 V741 V742 V743 V744 V745 V746 V747 V748 V749 V750
#
#  The data contained 2 columns with zero or tiny median absolute deviation.
#  Their column names are:
#
# [1] V12 V13
#
#  These columns will be ignored in the analysis.
#  We continue with the remaining 737 columns:
#
#   [1] V14  V15  V16  V17  V18  V19  V20  V21  V22  V23  V24  V25  V26  V27
#    ......
# [729] V742 V743 V744 V745 V746 V747 V748 V749 V750
#
#  The final data set we will analyze has 180 rows and 737 columns.

dim(remX)

## [1] 180 737

We will compare this with the faster approximate algorithm of DDC, obtained by the option fastDDC=TRUE:

fastDDCpars = list(fastDDC = TRUE, silent = TRUE)
fastDDCglass = DDC(data_glass, fastDDCpars) # takes 2 seconds

##  
##  The final data set we will analyze has 180 rows and 737 columns.
##

remXfast = fastDDCglass$remX
all.equal(remX,remXfast) # The remaining data is the same:

## [1] TRUE

We also apply the rowwise method ROBPCA to detect outlying rows:

library(rrcov) # contains ROBPCA

PCAglass = rrcov::PcaHubert(remX,alpha=0.75,scale=FALSE)

n = nrow(remX)
nrowsinblock = 5
rowtitle = "glass samples"
rowlabels = rep("",floor(n/nrowsinblock));
rowlabels[1] = "1"
rowlabels[floor(n/nrowsinblock)] = "n";

d = ncol(remX)
ncolumnsinblock = 5
columntitle = "wavelengths"
columnlabels = rep("",floor(d/ncolumnsinblock));
columnlabels[1] = "1";
columnlabels[floor(d/ncolumnsinblock)] = "d"

ggpROBPCA = cellMap(matrix(0,n,d),
                    indrows=which(PCAglass@flag==FALSE),
                    rowblocklabels=rowlabels,
                    columnblocklabels=columnlabels,
                    mTitle="By row", nrowsinblock=5,
                    ncolumnsinblock=5,
                    rowtitle="glass samples",
                    columntitle="wavelengths",
                    sizetitles=1.2,
                    columnangle=0,
                    drawCircles = TRUE)
plot(ggpROBPCA)

plot of chunk unnamed-chunk-26

ggpDDC = cellMap(DDCglass$stdResid,
                 indrows=DDCglass$indrows,
                 rowblocklabels=rowlabels,
                 columnblocklabels=columnlabels,
                 mTitle="DDC", nrowsinblock=5,
                 ncolumnsinblock=5,
                 rowtitle="glass samples",
                 columntitle="wavelengths",
                 sizetitles=1.2,
                 columnangle=0,
                 drawCircles = TRUE)
plot(ggpDDC)

plot of chunk unnamed-chunk-26

# pdf("cellmap_glass_ROBPCA_DDC.pdf",width=8,height=6)
# gridExtra::grid.arrange(ggpROBPCA,ggpDDC,ncol=1)
# dev.off()

ggpfastDDC = cellMap(fastDDCglass$stdResid,
                     indrows=fastDDCglass$indrows,
                     rowblocklabels=rowlabels,
                     columnblocklabels=columnlabels,
                     mTitle="fastDDC", nrowsinblock=5,
                     ncolumnsinblock=5,
                     rowtitle="glass samples",
                     columntitle="wavelengths",
                     sizetitles=1.2,
                     columnangle=0,
                     drawCircles = TRUE)
plot(ggpfastDDC)

plot of chunk unnamed-chunk-26

# pdf("cellmap_glass_DDC_fastDDC.pdf",width=8,height=6)
# gridExtra::grid.arrange(ggpDDC,ggpfastDDC,ncol=1)
# dev.off()

DDC examples

Raymaekers, J. and Rousseeuw, P.J.

2023-10-25

Introduction

Example with row and column selection

Small generated dataset

TopGear dataset

Analyzing new data by DDCpredict

Define the “initial” dataset as the rows not in these 17:

Define the “new” dataset, and apply DDCpredict to it:

Philips data

We also apply the rowwise method MCD to detect outlying rows:

Mortality dataset

We also apply the rowwise method ROBPCA to detect outlying rows:

Glass dataset

We will compare this with the faster approximate algorithm of DDC, obtained by the option fastDDC=TRUE:

We also apply the rowwise method ROBPCA to detect outlying rows: