## -----------------------------------------------------------------------------
#| label: tbl-preprocessing
#| tbl-cap: "Pre-processing functions available in the package."
#| echo: false

library(prospectr)

preprocessing_fns <- data.frame(
  `Function` = c(
    "`movav`", "`savitzkyGolay`", "`gapDer`", "`baseline`",
    "`continuumRemoval`", "`detrend`", "`standardNormalVariate`",
    "`msc`", "`binning`", "`resample`", "`resample2`",
    "`blockScale`", "`blockNorm`"
  ),
  `Description` = c(
    "Simple moving (or running) average filter",
    "Savitzky--Golay smoothing and derivative",
    "Gap--segment derivative",
    "Baseline removal (similar to `continuumRemoval`)",
    "Computes continuum--removed values",
    "Detrend normalisation",
    "Standard Normal Variate (SNV) transformation",
    "Multiplicative Scatter Correction",
    "Average a signal in column bins",
    "Resample a signal to new band positions",
    "Resample a signal using new FWHM values",
    "Block scaling",
    "Sum of squares block weighting"
  ),
  check.names = FALSE
)

knitr::kable(preprocessing_fns)


## -----------------------------------------------------------------------------
#| label: load-NIRsoil
#| message: false

library(prospectr)
data(NIRsoil)
# NIRsoil is a data.frame with 825 observations and 5 variables:
# Nt (Total Nitrogen), Ciso (Carbon), CEC (Cation Exchange Capacity),
# train (0/1 indicator for training (1) and validation (0) samples),
# spc (spectral matrix)
str(NIRsoil)


## -----------------------------------------------------------------------------
#| label: fig-movav
#| fig-cap: "Effect of a moving average with window size of 11 bands on a raw spectrum."
#| fig-height: 4
#| fig-width: 6
#| echo: true

# Add some noise
noisy <- NIRsoil$spc + rnorm(length(NIRsoil$spc), 0, 0.001)

# Plot the first spectrum
plot(
  x = as.numeric(colnames(NIRsoil$spc)),
  y = noisy[1, ],
  type = "l",
  lwd = 1.5,
  xlab = "Wavelength",
  ylab = "Absorbance"
)

# Window size of 11 bands; note the 5 first and last bands are lost
X <- movav(X = noisy, w = 11)
lines(x = as.numeric(colnames(X)), y = X[1, ], lwd = 1.5, col = "red")
grid()
legend(
  "topleft",
  legend = c("Raw", "Moving average"),
  lty = 1,
  col = c("black", "red")
)


## -----------------------------------------------------------------------------
#| label: savitzky-golay
#| echo: true

# p = polynomial order
# w = window size (must be odd)
# m = m-th derivative (0 = smoothing)
# Accepts vectors, data frames, or matrices (observations row-wise)
sgvec <- savitzkyGolay(X = NIRsoil$spc[1, ], p = 3, w = 11, m = 0)
sg <- savitzkyGolay(X = NIRsoil$spc, p = 3, w = 11, m = 0)

# Bands at the edges of the spectral matrix are lost
dim(NIRsoil$spc)
dim(sg)


## -----------------------------------------------------------------------------
#| label: tbl-derivatives
#| tbl-cap: "Advantages and drawbacks of derivative spectra."
#| echo: false

der_table <- data.frame(
  Advantage = c(
    "Reduces baseline offset",
    "Can resolve overlapping absorptions",
    "Compensates for instrumental drift",
    "Enhances small spectral absorptions",
    "Often increases predictive accuracy for complex datasets"
  ),
  Drawback = c(
    "Risk of overfitting the calibration model",
    "Increases noise; smoothing required",
    "Increases uncertainty in model coefficients",
    "Complicates spectral interpretation",
    "Removes the baseline"
  )
)

knitr::kable(der_table)


## -----------------------------------------------------------------------------
#| label: fig-derivatives
#| fig-cap: "First and second derivatives of a raw spectrum."
#| fig-height: 4
#| fig-width: 6
#| echo: true

d1 <- t(diff(t(NIRsoil$spc), differences = 1))  # first derivative
d2 <- t(diff(t(NIRsoil$spc), differences = 2))  # second derivative

plot(
  as.numeric(colnames(d1)), d1[1, ],
  type = "l", lwd = 1.5,
  xlab = "Wavelength", ylab = ""
)
lines(as.numeric(colnames(d2)), d2[1, ], lwd = 1.5, col = "red")
grid()
legend(
  "topleft",
  legend = c("1st derivative", "2nd derivative"),
  lty = 1, col = c("black", "red")
)


## -----------------------------------------------------------------------------
#| label: fig-gap-derivative
#| fig-cap: "Comparison of first derivative (finite difference) and
#|   gap derivative (lag = 10 bands) for the first spectrum."
#| fig-height: 4
#| fig-width: 6
#| echo: true

# m = derivative order; w = gap size; s = segment size
gsd1 <- gapDer(X = NIRsoil$spc, m = 1, w = 11, s = 5)

plot(
  as.numeric(colnames(d1)), d1[1, ],
  type = "l", lwd = 1.5,
  xlab = "Wavelength (nm)", ylab = "1st derivative"
)
par(new = TRUE)
plot(
  as.numeric(colnames(gsd1)), gsd1[1, ],
  type = "l", lwd = 1.5, col = "red",
  xaxt = "n", yaxt = "n",
  xlab = "", ylab = ""
)
axis(4, col = "red")
mtext("Gap-segment derivative", side = 4, line = 3, col = "red")
grid()
legend(
  "topleft",
  legend = c("1st derivative", "Gap-segment 1st derivative"),
  lty = 1, col = c("black", "red")
)
par(new = FALSE)


## -----------------------------------------------------------------------------
#| label: fig-snv
#| fig-cap: "Raw absorbance spectra (left) and SNV-transformed spectra (right)
#|   for the first five observations."
#| fig-height: 4
#| fig-width: 8
#| echo: true

snv <- standardNormalVariate(X = NIRsoil$spc)
wav <- as.numeric(colnames(NIRsoil$spc))

par(mfrow = c(1, 2))

matplot(
  wav, t(NIRsoil$spc[1:5, ]),
  type = "l", lty = 1, lwd = 1.5,
  col = c("black", "grey20", "grey40", "grey60", "grey80"),
  xlab = "Wavelength (nm)", ylab = "Absorbance",
  main = "Raw"
)
grid()

matplot(
  wav, t(snv[1:5, ]),
  type = "l", lty = 1, lwd = 1.5,
  col = c("red", "tomato", "coral", "salmon", "lightsalmon"),
  xlab = "Wavelength (nm)", ylab = "SNV (Absorbance)",
  main = "SNV"
)
grid()

par(mfrow = c(1, 1))


## -----------------------------------------------------------------------------
#| label: fig-msc
#| fig-cap: "Effect of MSC on a raw spectrum (reference = mean spectrum)."
#| fig-height: 4
#| fig-width: 6
#| echo: true

# Reference spectrum is the column mean of the full matrix
msc_spc <- msc(X = NIRsoil$spc, ref_spectrum = colMeans(NIRsoil$spc))

plot(
  as.numeric(colnames(NIRsoil$spc)), NIRsoil$spc[1, ],
  type = "l", lwd = 1.5,
  xlab = "Wavelength (nm)", ylab = "Absorbance"
)
lines(as.numeric(colnames(msc_spc)), msc_spc[1, ], lwd = 1.5, col = "red")
grid()
legend(
  "topleft",
  legend = c("Raw", "MSC"),
  lty = 1, col = c("black", "red")
)


## -----------------------------------------------------------------------------
#| label: msc-transfer
#| eval: false
#| echo: true

# # Training and validation partitions
# Xr <- NIRsoil$spc[NIRsoil$train == 1, ]
# Xu <- NIRsoil$spc[NIRsoil$train == 0, ]
# 
# # Fit MSC on the training set (reference = mean of Xr by default)
# Xr_msc <- msc(Xr)
# 
# # Apply the same reference spectrum to the validation set
# Xu_msc <- msc(Xu, ref_spectrum = attr(Xr_msc, "Reference spectrum"))


## -----------------------------------------------------------------------------
#| label: fig-detrend
#| fig-cap: "Effect of SNV-Detrend on a raw spectrum. Note the different
#|   y-axis scales: raw absorbance (left) and detrended signal (right)."
#| fig-height: 4
#| fig-width: 6
#| echo: true

wav <- as.numeric(colnames(NIRsoil$spc))
dt  <- detrend(X = NIRsoil$spc, wav = wav)

# Dual y-axis: raw and detrended spectra are on different scales
plot(
  wav, NIRsoil$spc[1, ],
  type = "l", lwd = 1.5,
  xlab = "Wavelength (nm)", ylab = "Absorbance"
)
par(new = TRUE)
plot(
  wav, dt[1, ],
  type = "l", lwd = 1.5, col = "red",
  xaxt = "n", yaxt = "n",
  xlab = "", ylab = ""
)
axis(4, col = "red")
mtext("Detrended", side = 4, line = 3, col = "red")
grid()
legend(
  "topleft",
  legend = c("Raw", "SNV-Detrend"),
  lty = 1, col = c("black", "red")
)
par(new = FALSE)


## -----------------------------------------------------------------------------
#| label: fig-baseline
#| fig-cap: "Raw absorbance spectra (left) and baseline-corrected spectra (right)
#| for the first three observations."
#| fig-height: 4
#| fig-width: 8
#| echo: true

wav <- as.numeric(colnames(NIRsoil$spc))
bs <- baseline(NIRsoil$spc, wav)
fitted_baselines <- attr(bs, "baselines")

par(mfrow = c(1, 2))

# Raw spectra with fitted baselines
matplot(
  wav, t(NIRsoil$spc[1:3, ]),
  type = "l", lty = 1, lwd = 1.5,
  col = c("black", "grey40", "grey70"),
  xlab = "Wavelength (nm)", ylab = "Absorbance",
  main = "Raw and baseline"
)
matlines(wav, t(fitted_baselines[1:3, ]), lty = 2, col = c("black", "grey40", "grey70"))
grid()

# Baseline-corrected spectra
matplot(
  wav, t(bs[1:3, ]),
  type = "l", lty = 1, lwd = 1.5,
  col = c("red", "tomato", "salmon"),
  xlab = "Wavelength (nm)", ylab = "Absorbance",
  main = "Baseline corrected"
)
grid()

par(mfrow = c(1, 1))


## -----------------------------------------------------------------------------
#| label: block-scale
#| echo: true

# type = "soft" or "hard"
# Returns a list with the scaled matrix (Xscaled) and the divisor (f)
bsc <- blockScale(X = NIRsoil$spc, type = "hard")$Xscaled
sum(apply(bsc, 2, var))  # should equal 1


## -----------------------------------------------------------------------------
#| label: block-norm
#| echo: true

# targetnorm = desired Frobenius norm (sum of squares) for X
bn <- blockNorm(X = NIRsoil$spc, targetnorm = 1)$Xscaled
sum(bn^2)  # should equal 1


## -----------------------------------------------------------------------------
#| label: fig-resample
#| fig-cap: "Original NIR spectrum and resampled spectrum at a coarser
#|   resolution (every 10 nm) using interpolation."
#| fig-height: 4
#| fig-width: 6
#| echo: true

wav <- as.numeric(colnames(NIRsoil$spc))
wav_coarse <- seq(min(wav), max(wav), by = 10)

rs <- resample(X = NIRsoil$spc, wav = wav, new.wav = wav_coarse)

plot(
  wav, NIRsoil$spc[1, ],
  type = "l", lwd = 1.5,
  xlab = "Wavelength (nm)", ylab = "Absorbance"
)
lines(
  wav_coarse, rs[1, ], lwd = 1.5, col = "red", type = "b", pch = 19, cex = 0.5
)
grid()
legend(
  "topleft",
  legend = c("Original", "Resampled (10 nm)"),
  lty = 1, col = c("black", "red")
)


## -----------------------------------------------------------------------------
#| label: fig-continuum-removal
#| fig-cap: "Raw absorbance spectra (left) and continuum-removed spectra (right)
#|   for the first three observations."
#| fig-height: 4
#| fig-width: 8
#| echo: true

# type = "A" for absorbance, "R" for reflectance (default)
cr  <- continuumRemoval(X = NIRsoil$spc, type = "A")
wav <- as.numeric(colnames(NIRsoil$spc))

par(mfrow = c(1, 2))

matplot(
  wav, t(NIRsoil$spc[1:3, ]),
  type = "l", lty = 1, lwd = 1.5,
  col = c("black", "grey40", "grey70"),
  xlab = "Wavelength (nm)", ylab = "Absorbance",
  main = "Raw"
)
grid()

matplot(
  wav, t(cr[1:3, ]),
  type = "l", lty = 1, lwd = 1.5,
  col = c("red", "tomato", "salmon"),
  xlab = "Wavelength (nm)", ylab = "Continuum-removed",
  main = "Continuum removal"
)
grid()

par(mfrow = c(1, 1))

