library(cellWise)
In this vignette we illustrate the use of MacroPCA with fixed center in the context of correspondence analysis. It reproduces the examples in the report Raymaekers and Rousseeuw (2022), Challenges of cellwise outliers.
In this section we analyze the clothes data through correspondence analysis. We start by loading the data, and constructing the matrix S:
data("data_clothes")
X <- data_clothes
dim(X)
## [1] 28 5
# create matrix S as in paper:
sum(X) # total count is 4373
## [1] 4373
P <- X / sum(X) # relative frequencies that add up to 1
rvec <- rowSums(P) # vector of row totals
cvec <- colSums(P) # vector of column totals
R <- X / rowSums(X) # row profiles
rowSums(R) # all 1, OK
## GB SK BG IE BE ES PL FI GR HU SI NL IT RO AT FR HR SE CZ DK DE LT PT EE LU MT
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## LV CY
## 1 1
S <- diag(sqrt(rvec)) %*% (R - rep(1, nrow(X)) %*%
t(cvec)) %*% diag(1/sqrt(cvec))
dimnames(S) <- dimnames(X)
round(S, 3)
## X1 X2 X3 X4 X5
## GB 0.100 0.006 -0.042 -0.030 -0.039
## SK 0.208 0.002 -0.071 -0.066 -0.085
## BG 0.027 0.051 0.003 -0.025 -0.058
## IE -0.057 -0.028 0.008 0.021 0.059
## BE -0.054 -0.035 0.031 0.037 0.025
## ES -0.045 -0.020 -0.014 0.045 0.036
## PL -0.041 0.001 -0.001 0.025 0.019
## FI -0.036 -0.013 0.018 0.017 0.016
## GR 0.047 -0.015 -0.004 -0.003 -0.026
## HU -0.019 0.009 -0.004 0.000 0.015
## SI -0.028 -0.022 -0.001 0.023 0.030
## NL 0.003 -0.012 -0.013 0.006 0.015
## IT -0.033 0.010 0.011 0.002 0.012
## RO 0.054 0.038 0.011 -0.041 -0.066
## AT -0.042 -0.019 0.004 0.003 0.056
## FR -0.025 -0.009 0.018 -0.006 0.024
## HR -0.042 -0.014 0.014 -0.001 0.045
## SE -0.048 0.000 0.031 0.006 0.013
## CZ -0.041 -0.003 -0.001 0.008 0.039
## DK -0.048 -0.003 0.014 0.024 0.015
## DE -0.034 -0.005 -0.002 0.009 0.034
## LT -0.056 -0.012 0.020 0.030 0.022
## PT -0.003 0.028 0.003 -0.021 -0.008
## EE -0.021 -0.010 0.004 0.008 0.021
## LU -0.004 -0.013 -0.001 0.010 0.008
## MT 0.052 -0.019 0.011 -0.023 -0.022
## LV 0.033 0.048 -0.001 -0.030 -0.053
## CY -0.038 0.008 0.019 0.041 -0.027
d <- ncol(S)
# We verify that the points of S are on a hyperplane by construction:
(eigvals <- eigen(t(S) %*% S)$values)
## [1] 1.446489e-01 1.855747e-02 6.326238e-03 4.134890e-03 1.009893e-17
Now we analyze the data using DDC to identify suspicious cells:
DDC.out <- DDC(S)
##
## The input data has 28 rows and 5 columns.
ggp <- cellMap(R = DDC.out$stdResid,
mTitle = "clothes data",
rowtitle = "countries",
columntitle = "brackets")
# pdf("clothes_cellmap.pdf", width = 3, height = 6)
plot(ggp)
# dev.off()
United Kingdom (GB), Slovakia, and Romania stand out. The outlier removal in Riani et al (2022) takes out GB, SK, BG, GR, RO, MT, LV which are precisely the countries with flagged cells here! Their analysis doesn’t say much about why these countries are outlying, but here we see which brackets they import unusually much or little from.
For instance, the cellmap suggests that GB imports a lot of cheap clothing. The webpage
says that much is imported from Bangladesh, China, and India.
countryInds <- which(rownames(S) %in%
c("GB", "SK", "MT", "GR", "BG", "LV", "RO"))
matplot(t(S[countryInds, ]), pch = 16, col = 1:7)
lines(apply(S, 2, median), lwd = 3) # median profile
# The profile of these countries deviate in a similar way:
# they trade a lot of cheap clothes, and fewer expensive ones.
We now continue with classical correspondence analysis.
svd.out <- svd(S)
svd.out$d# S is indeed singular:
## [1] 3.803274e-01 1.362258e-01 7.953765e-02 6.430311e-02 2.723455e-17
(svd.out$d)^2 - eigen(t(S) %*% S)$values # ~0
## [1] 1.665335e-16 -1.040834e-17 7.806256e-18 6.071532e-18 -1.009893e-17
diff(c(0,cumsum(svd.out$d^2)/sum(svd.out$d^2)))
## [1] 0.83290720 0.10685628 0.03642729 0.02380923 0.00000000
# This can be shown in a scree plot.
# For plotting the rows:
rowproj <- diag(1/sqrt(rvec)) %*% svd.out$u %*% diag(svd.out$d)
# For plotting the columns = variables:
colproj <- diag(1/sqrt(cvec)) %*% svd.out$v %*% diag(svd.out$d)
# pdf(file="clothes_ClassCorrespA.pdf", width=7, height=6)
plot(rowproj[, 1:2], col = "white", xlab="", ylab="",
xlim = c(-1, 0.6), ylim = c(-0.6, 0.6))
title(main="Classical correspondence analysis of clothes data",
line=1)
text(rowproj[, 1:2], labels = rownames(S))
abline(v=0); abline(h=0)
text(colproj[, 1:2], labels = colnames(S), col="red")
facs = c(0.93,0.85,0.78,0.84,0.87)
shape::Arrows(0, 0, facs*colproj[, 1], facs*colproj[, 2],
col = "red", arr.type="simple", arr.width=0.1,
arr.length = 0.1) # , arr.adj = 0)
## NULL
title(xlab="Dimension 1", line=2.3)
title(ylab="Dimension 2", line=2.3)
# dev.off()
Now we perform a cellwise robust correspondence analysis.
# We apply MacroPCA with center fixed at zero.
# As in classical CA, we do not prescale S.
MacroPCApar0 <- list(scale = FALSE, center = rep(0,d))
MacroPCA.out <- MacroPCA(S, k=0, MacroPCApars = MacroPCApar0)
##
## The input data has 28 rows and 5 columns.
##
## Initial eigenvalues:
## 0.001858658 0.0002242119 0.000170005 7.881939e-05 5.808785e-05
##
## The cumulative percentage of explained variability
## by the first 5 components is:
## PC1 PC2 PC3 PC4 PC5
## 77.8 87.2 94.3 97.6 100.0
##
## Based on explained variability >= 80% one would select k = 2.
##
## Please use this information and the scree plot to select a value of k
## and rerun MacroPCA with it.
MacroPCA.out <- MacroPCA(S, k=2, MacroPCApars = MacroPCApar0)
##
## The input data has 28 rows and 5 columns.
##
## Initial eigenvalues:
## 0.001608485 0.0002046096 0.0001747361 7.98259e-05 5.355761e-05
##
## XciSVD$rank, Xcih.SVD$rank, k and kmax: 5 5 2 5
##
## Performed an extra reweighting step because k = 2 < rank = 5 .
##
## The final step used eigenvectors of reweighted covariance.
(eigvals <- MacroPCA.out$eigenvalues)
## [1] 0.0027284503 0.0002045247
diff(c(0,cumsum(eigvals)/sum(eigvals)))
## [1] 0.93026715 0.06973285
# Compute the principal coordinates for the biplot.
# To make the plot match the orientation in Riani et al:
V <- -MacroPCA.out$loadings
Tscores <- -MacroPCA.out$scores
sVals <- sqrt(nrow(S)*MacroPCA.out$eigenvalues)
U <- Tscores %*% diag(1/sVals)
rowproj <- diag(1/sqrt(rvec)) %*% U %*% diag(sVals)
colproj <- diag(1/sqrt(cvec)) %*% V %*% diag(sVals)
# pdf(file="clothes_MacroCA_biplot.pdf", width=7, height=6)
plot(rowproj[, 1:2], col = "white", xlim=c(-0.95,0.65),
ylim=c(-0.6,0.6), xlab="", ylab="")
title(main="Cellwise robust correspondence analysis of clothes data",
line=1)
text(rowproj[,1:2], labels = rownames(S))
abline(h=0); abline(v=0)
text(colproj[, 1:2], labels = colnames(S), col="red")
facs = c(0.9,0.8,0.5,0.75,0.85)
shape::Arrows(0, 0, facs*colproj[, 1], facs*colproj[, 2],
col = "red", arr.type="simple", arr.width=0.1,
arr.length = 0.1)
## NULL
title(xlab="Dimension 1", line=2.3)
title(ylab="Dimension 2", line=2.3)
# dev.off()
# Matches Fig 4 of Riani quite well.
We start by loading the data and constructing the matrix S.
data("data_brands")
X <- data_brands
dim(X)
## [1] 39 7
sum(X) # total count is 11713
## [1] 11713
P <- X/sum(X) # relative frequencies that add up to 1
rvec <- rowSums(P) # vector of row totals
hist(rvec) # Right tail: Chevrolet, Ford, Honda, Toyota
# These brands are well known and sold a lot in the US.
cvec <- colSums(P) # vector of column totals
R <- X / rowSums(X) # row profiles
S <- diag(sqrt(rvec)) %*% (R - rep(1, nrow(X)) %*%
t(cvec)) %*% diag(1/sqrt(cvec))
dimnames(S) <- dimnames(X)
d <- ncol(S)
Now we continue by executing DDC to identify suspicious cells.
DDC.out <- DDC(S)
##
## The input data has 39 rows and 7 columns.
ggp <- cellMap(R = DDC.out$stdResid,
mTitle = "brands data",
rowtitle = "brands",
columntitle = "perceptions")
# pdf("brands_cellmap.pdf", width = 3.5, height = 8)
plot(ggp)
# dev.off()
# Volvo is most deviating (3 cells), followed by Hyundai
# (2 cells) and Maserati (2 cells).
Now we analyze the brands data by means of classical correspondence analysis.
svd.out <- svd(S)
svd.out$d # S is singular:
## [1] 3.354798e-01 2.805778e-01 1.707355e-01 1.565453e-01 1.285439e-01
## [6] 1.044026e-01 5.623106e-17
diff(c(0,cumsum(svd.out$d^2)/sum(svd.out$d^2)))
## [1] 0.41324121 0.28905304 0.10703318 0.08998101 0.06067003 0.04002153 0.00000000
# Can be plotted in a scree plot.
# To match the plot in Riani at al:
svd.out$v[, 2] = -svd.out$v[, 2]
svd.out$u[, 2] = -svd.out$u[, 2]
rowproj <- diag(1/sqrt(rvec)) %*% svd.out$u %*% diag(svd.out$d)
colproj <- diag(1/sqrt(cvec)) %*% svd.out$v %*% diag(svd.out$d)
# pdf(file="brands_ClassCorrespA.pdf", width=7, height=6)
plot(rowproj[, 1:2], col = "white", xlim=c(-1.1,1.1),
ylim=c(-0.8,1.5), xlab="", ylab="")
title(main="Classical correspondence analysis of brands data",
line=1)
text(rowproj[,1:2], labels = rownames(S))
abline(v=0); abline(h=0)
text(colproj[, 1:2], labels = colnames(S), col="red")
facs = c(0.8,0.9,0.8,0.65,0.92,0.8,0.65)
shape::Arrows(0, 0, facs*colproj[, 1], facs*colproj[, 2],
col = "red", arr.type="simple", arr.width=0.1,
arr.length = 0.1)
## NULL
title(xlab="Dimension 1", line=2.3)
title(ylab="Dimension 2", line=2.3)
# dev.off()
Cellwise robust correspondence analysis of the brands data:
MacroPCApar0 <- list(scale = FALSE, center = rep(0, d))
MacroPCA.out <- MacroPCA(S, k = 0, MacroPCApars = MacroPCApar0)
##
## The input data has 39 rows and 7 columns.
##
## Initial eigenvalues:
## 0.001637705 0.0005991985 0.000517501 0.000261448 0.0001165996 9.420168e-05 1.323828e-05
##
## The cumulative percentage of explained variability
## by the first 7 components is:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## 50.5 69.0 85.0 93.1 96.7 99.6 100.0
##
## Based on explained variability >= 80% one would select k = 3.
##
## Please use this information and the scree plot to select a value of k
## and rerun MacroPCA with it.
MacroPCA.out <- MacroPCA(S, k = 3, MacroPCApars = MacroPCApar0)
##
## The input data has 39 rows and 7 columns.
##
## Initial eigenvalues:
## 0.001310427 0.0006177467 0.0004581388 0.0002548126 0.0001141835 0.0001013869 1.44098e-05
##
## XciSVD$rank, Xcih.SVD$rank, k and kmax: 7 7 3 7
##
## Performed an extra reweighting step because k = 3 < rank = 7 .
##
## The final step used eigenvectors of reweighted covariance.
# Computations for the biplot.
V <- MacroPCA.out$loadings
scores <- MacroPCA.out$scores
# To make the plot match the orientation in Riani et al:
V[,2] <- -V[,2]
scores[,2] <- -scores[,2]
sVals <- sqrt(nrow(S)*MacroPCA.out$eigenvalues)
U <- scores %*% diag(1/sVals)
rowproj <- diag(1/sqrt(rvec)) %*% U %*% diag(sVals)
colproj <- diag(1/sqrt(cvec)) %*% V %*% diag(sVals)
# pdf(file="brands_MacroCA_biplot.pdf", width=7, height=6)
plot(rowproj[, 1:2], col = "white", xlim=c(-1.1,1.1),
ylim=c(-0.6,0.6), xlab="", ylab="")
title(main="Cellwise robust correspondence analysis of brands data",
line=1)
text(rowproj[,1:2], labels = rownames(S))
abline(h=0); abline(v=0)
text(colproj[, 1:2], labels = colnames(S), col="red")
facs = c(0.75,0.76,0.9,0.88,0.65,0.74,0.52)
shape::Arrows(0, 0, facs*colproj[, 1], facs*colproj[, 2],
col = "red", arr.type="simple", arr.width=0.1,
arr.length = 0.1)
## NULL
title(xlab="Dimension 1", line=2.3)
title(ylab="Dimension 2", line=2.3)
# dev.off()
# Roughly matches Figure 7 of Riani et al (2022).