# Synthetic grid dataset with jitter
set.seed(1002)
grid_xy <- expand.grid(x1 = 1:30, x2 = 1:30)
X_synthetic <- data.frame(
x1 = grid_xy$x1 + rnorm(nrow(grid_xy), 0, 0.3),
x2 = grid_xy$x2 + rnorm(nrow(grid_xy), 0, 0.3)
)
kms_synthetic <- naes(X = X_synthetic, k = 40, iter.max = 100)
# NIRsoil dataset in PC space
kms <- naes(X = NIRsoil$spc, k = 5, pc = 2, iter.max = 100)
par(mfrow = c(1, 2))
plot(
X_synthetic,
col = rgb(0, 0, 0, 0.3), pch = 19,
main = "k-means (synthetic)"
)
grid()
points(X_synthetic[kms_synthetic$model, ], col = "red", pch = 19)
legend(
x = "top",
inset = c(0, -0.15),
xpd = TRUE,
horiz = TRUE,
bty = "n",
legend = c("All samples", "Selected"),
pch = 19,
col = c(rgb(0, 0, 0, 0.3), "red"),
bg = rgb(1, 1, 1, 0.8)
)
plot(
kms$pc,
col = rgb(0, 0, 0, 0.3), pch = 19,
main = "k-means (NIRsoil)"
)
grid()
points(kms$pc[kms$model, ], col = "red", pch = 19)
legend(
x = "top",
inset = c(0, -0.15),
xpd = TRUE,
horiz = TRUE,
bty = "n",
legend = c("All samples", "Selected"),
pch = 19,
col = c(rgb(0, 0, 0, 0.3), "red"),
bg = rgb(1, 1, 1, 0.8)
)
par(mfrow = c(1, 1))
The algorithm produces a calibration set with uniform coverage of the spectral space. The distance metric can be either Euclidean or Mahalanobis. A known limitation is that the algorithm is prone to selecting outliers (Ramirez-Lopez et al., 2014); outlier screening before sample selection is therefore recommended.
# Synthetic dataset: grid with jitter
grid_xy <- expand.grid(x1 = 1:30, x2 = 1:30)
set.seed(1014)
X_synthetic <- data.frame(
x1 = grid_xy$x1 + rnorm(nrow(grid_xy), 0, 0.3),
x2 = grid_xy$x2 + rnorm(nrow(grid_xy), 0, 0.3)
)
ken <- kenStone(X_synthetic, k = 40)
# NIRsoil dataset — Mahalanobis distance in PC space
ken_mahal <- kenStone(X = NIRsoil$spc, k = 20, metric = "mahal", pc = 2)
par(mfrow = c(1, 2))
plot(
X_synthetic,
col = rgb(0, 0, 0, 0.3), pch = 19,
main = "Kennard-Stone (synthetic)"
)
grid()
points(X_synthetic[ken$model, ], col = "red", pch = 19, cex = 1.2)
legend(
x = "top",
inset = c(0, -0.15),
xpd = TRUE,
horiz = TRUE,
bty = "n",
legend = c("All samples", "Selected"),
pch = 19,
col = c(rgb(0, 0, 0, 0.3), "red"),
bg = rgb(1, 1, 1, 0.8)
)
plot(
ken_mahal$pc[, 1], ken_mahal$pc[, 2],
col = rgb(0, 0, 0, 0.3), pch = 19,
xlab = "PC1", ylab = "PC2",
main = "Kennard-Stone (NIRsoil)"
)
grid()
points(
ken_mahal$pc[ken_mahal$model, 1],
ken_mahal$pc[ken_mahal$model, 2],
pch = 19, col = "red"
)
legend(
x = "top",
inset = c(0, -0.15),
xpd = TRUE,
horiz = TRUE,
bty = "n",
legend = c("All samples", "Selected"),
pch = 19,
col = c(rgb(0, 0, 0, 0.3), "red"),
bg = rgb(1, 1, 1, 0.8)
)
par(mfrow = c(1, 1))
plot(
ken_mahal_init$pc[, 1], ken_mahal_init$pc[, 2],
col = rgb(0, 0, 0, 0.3), pch = 19,
xlab = "PC1", ylab = "PC2",
main = "Kennard-Stone with initialisation"
)
grid()
points(
ken_mahal_init$pc[ken_mahal_init$model, 1],
ken_mahal_init$pc[ken_mahal_init$model, 2],
pch = 19, col = "red"
)
points(
ken_mahal_init$pc[initialization_ind, 1],
ken_mahal_init$pc[initialization_ind, 2],
pch = 19, cex = 1.5, col = "dodgerblue"
)
legend(
x = "top",
inset = c(0, -0.15),
xpd = TRUE,
horiz = TRUE,
bty = "n",
legend = c("All samples", "Selected", "Initialisation"),
pch = 19,
col = c(rgb(0, 0, 0, 0.3), "red", "dodgerblue"),
bg = rgb(1, 1, 1, 0.8)
)
dup <- duplex(X = X_synthetic, k = 15)
dup_nir <- duplex(X = NIRsoil$spc, k = 20, metric = "mahal", pc = 2)
par(mfrow = c(1, 2))
plot(
X_synthetic,
col = rgb(0, 0, 0, 0.3), pch = 19,
main = "DUPLEX (synthetic)"
)
grid()
points(X_synthetic[dup$model, ], col = "red", pch = 19)
points(X_synthetic[dup$test, ], col = "dodgerblue", pch = 19)
legend(
x = "top",
inset = c(0, -0.15),
xpd = TRUE,
horiz = TRUE,
bty = "n",
legend = c("All samples", "Calibration", "Validation"),
pch = 19,
col = c(rgb(0, 0, 0, 0.3), "red", "dodgerblue")
)
plot(
dup_nir$pc[, 1], dup_nir$pc[, 2],
col = rgb(0, 0, 0, 0.3), pch = 19,
xlab = "PC1", ylab = "PC2",
main = "DUPLEX (NIRsoil)"
)
grid()
points(dup_nir$pc[dup_nir$model, 1], dup_nir$pc[dup_nir$model, 2],
col = "red", pch = 19)
points(dup_nir$pc[dup_nir$test, 1], dup_nir$pc[dup_nir$test, 2],
col = "dodgerblue", pch = 19)
legend(
x = "top",
inset = c(0, -0.15),
xpd = TRUE,
horiz = TRUE,
bty = "n",
legend = c("All samples", "Calibration", "Validation"),
pch = 19,
col = c(rgb(0, 0, 0, 0.3), "red", "dodgerblue")
)
par(mfrow = c(1, 1))
shenk_synthetic <- shenkWest(X = X_synthetic, d.min = 0.1)
shenk <- shenkWest(X = NIRsoil$spc, d.min = 0.6, pc = 2)
par(mfrow = c(1, 2), mar = c(5, 4, 6, 2))
plot(
X_synthetic,
col = rgb(0, 0, 0, 0.3), pch = 19,
main = "SELECT (synthetic)"
)
grid()
points(X_synthetic[shenk_synthetic$model, ], col = "red", pch = 19)
legend(
x = "top",
inset = c(0, -0.15),
xpd = TRUE,
horiz = TRUE,
bty = "n",
legend = c("All samples", "Selected"),
pch = 19,
col = c(rgb(0, 0, 0, 0.3), "red")
)
plot(
shenk$pc,
col = rgb(0, 0, 0, 0.3), pch = 19,
xlab = "PC1", ylab = "PC2",
main = "SELECT (NIRsoil)"
)
grid()
points(shenk$pc[shenk$model, ], col = "red", pch = 19)
legend(
x = "top",
inset = c(0, -0.15),
xpd = TRUE,
horiz = TRUE,
bty = "n",
legend = c("All samples", "Selected"),
pch = 19,
col = c(rgb(0, 0, 0, 0.3), "red")
)
par(mfrow = c(1, 1))
pu_synthetic <- puchwein(X = X_synthetic, k = 0.2)
pu <- puchwein(X = NIRsoil$spc, k = 0.2, pc = 2)
par(mfrow = c(1, 2), mar = c(5, 4, 6, 2))
plot(
X_synthetic,
col = rgb(0, 0, 0, 0.3), pch = 19,
main = "Puchwein (synthetic)"
)
grid()
points(X_synthetic[pu_synthetic$model, ], col = "red", pch = 19)
legend(
x = "top",
inset = c(0, -0.15),
xpd = TRUE,
horiz = TRUE,
bty = "n",
legend = c("All samples", "Selected"),
pch = 19,
col = c(rgb(0, 0, 0, 0.3), "red")
)
plot(
pu$pc,
col = rgb(0, 0, 0, 0.3), pch = 19,
xlab = "PC1", ylab = "PC2",
main = "Puchwein (NIRsoil)"
)
grid()
points(pu$pc[pu$model, ], col = "red", pch = 19)
legend(
x = "top",
inset = c(0, -0.15),
xpd = TRUE,
horiz = TRUE,
bty = "n",
legend = c("All samples", "Selected"),
pch = 19,
col = c(rgb(0, 0, 0, 0.3), "red")
)
par(mfrow = c(1, 1))
A distinctive feature of the Puchwein algorithm is the leverage diagnostic, which helps identify the optimal number of loops (and hence calibration samples). Figure 7 shows the difference between the theoretical and observed sum of leverages as a function of samples removed, and the number of samples retained per loop.
ho <- honigs(X = NIRsoil$spc, k = 10, type = "A")
wav <- as.numeric(colnames(NIRsoil$spc))
par(mfrow = c(1, 2), mar = c(5, 4, 6, 2))
matplot(
wav, t(NIRsoil$spc),
type = "l", lty = 1, lwd = 0.5,
col = rgb(0, 0, 0, 0.1),
xlab = "Wavelength (nm)", ylab = "Absorbance",
main = "All spectra"
)
matlines(wav, t(NIRsoil$spc[ho$model, ]), lty = 1, lwd = 1.5, col = "red")
legend(
x = "top",
inset = c(0, -0.15),
xpd = TRUE,
horiz = TRUE,
bty = "n",
legend = c("All spectra", "Selected"),
lty = 1,
col = c(rgb(0, 0, 0, 0.3), "red")
)
matplot(
wav, t(NIRsoil$spc[ho$model, ]),
type = "l", lty = 1, lwd = 1.5,
xlab = "Wavelength (nm)", ylab = "Absorbance",
main = "Selected spectra"
)
abline(v = wav[ho$bands], lty = 2, col = "grey40")
grid()
legend(
x = "top",
inset = c(0, -0.15),
xpd = TRUE,
horiz = TRUE,
bty = "n",
legend = c("Selected spectra", "Bands used"),
lty = c(1, 2),
col = c("black", "grey40")
)
par(mfrow = c(1, 1))
Deming, W.E., 1986. Out of the crisis. MIT Center for Advanced Engineering Study, Cambridge, MA.
Fernandez-Pierna, J.A., Dardenne, P., 2008. Soil parameter quantification by NIRS as a chemometric challenge at “chimiométrie 2006.” Chemometrics and intelligent laboratory systems 91, 94–98.
Honigs, D., Hieftje, G.M., Mark, H., Hirschfeld, T., 1985. Unique-sample selection via near-infrared spectral subtraction. Analytical Chemistry 57, 2299–2303.
Kennard, R.W., Stone, L.A., 1969. Computer aided design of experiments. Technometrics 11, 137–148.
Naes, T., Isaksson, T., Fearn, T., Davies, T., 2002. Outlier detection. A user-friendly guide to multivariate calibration and classification.
Puchwein, G., 1988. Selection of calibration samples for near-infrared spectrometry by factor analysis of spectra. Analytical Chemistry 60, 569–573.
Ramirez-Lopez, L., Schmidt, K., Behrens, T., Van Wesemael, B., Demattê, J.A., Scholten, T., 2014. Sampling optimal calibration sets in soil infrared spectroscopy. Geoderma 226, 140–150.
Shenk, J., Westerhaus, M., 1991. Population definition, sample selection, and calibration procedures for near infrared reflectance spectroscopy. Crop science 31, 469–474.
Snee, R.D., 1977. Validation of regression models: Methods and examples. Technometrics 19, 415–428.