{MetricsWeighted} provides weighted versions of different machine learning metrics and performance measures.
# From CRAN
install.packages("MetricsWeighted")
# Development version
devtools::install_github("mayer79/MetricsWeighted")
Currently, the following metrics and performance measures are available:
- accuracy(), recall(), precision(), f1_score(), and classification_error(): Typical binary performance measures derived from the confusion matrix, see https://en.wikipedia.org/wiki/Precision_and_recall. They require binary predictions. Except for the classification error, higher values are better.
- AUC() and gini_coefficient(): Area under the receiver operating characteristic (ROC) curve and the closely related Gini coefficient. Actual values must be 0 or 1, while predictions can take any value (only their order is relevant). The higher, the better.
- deviance_bernoulli() and logLoss(): Further metrics for binary targets, namely the average unit deviance of the binary logistic regression model (0-1 response) and log loss (half that deviance). As with all deviance measures, smaller values are better.
- mse(), mae(), mape(), rmse(), and medae(): Typical regression metrics (mean-squared error, mean absolute error, mean absolute percentage error, root-mean-squared error, and median absolute error). The lower, the better.
- deviance_tweedie(): (Unscaled) average unit Tweedie deviance with parameter tweedie_p, see (Jorgensen 1997) and https://en.wikipedia.org/wiki/Tweedie_distribution for the underlying formula.
- deviance_normal(), deviance_gamma(), and deviance_poisson(): Special cases of the Tweedie deviance. deviance_normal() equals mean-squared error.
- elementary_score_quantile() and elementary_score_expectile(): Consistent scoring functions for quantiles and expectiles, see (Ehm et al. 2016).
- prop_within(): Proportion of predicted values within a given band around the true values.
They all take at least four arguments:

- actual: Actual observed values.
- predicted: Predicted values.
- w: Optional vector of case weights.
- ...: Further arguments.
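To illustrate this common interface, here is a minimal sketch on made-up toy vectors (the numbers are not part of the package's own examples):

# Toy data, only to show the common (actual, predicted, w, ...) interface
act <- c(0, 1, 2, 4)
pred <- c(0.5, 1, 2, 3)
wgt <- c(1, 1, 1, 2)

mse(act, pred)           # unweighted mean-squared error
mse(act, pred, w = wgt)  # the last case counts twice, so its error gets more weight

In the following, the same interface is applied to models fitted to the iris data.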
library(MetricsWeighted)

# The data
y_num <- iris[["Sepal.Length"]]
fit_num <- lm(Sepal.Length ~ ., data = iris)
pred_num <- fit_num$fitted
weights <- seq_len(nrow(iris))
# Performance metrics
mae(y_num, pred_num) # unweighted
#> [1] 0.2428628
mae(y_num, pred_num, w = rep(1, length(y_num))) # same
#> [1] 0.2428628
mae(y_num, pred_num, w = weights) # different
#> [1] 0.2561237
rmse(y_num, pred_num)
#> [1] 0.300627
medae(y_num, pred_num, w = weights) # median absolute error
#> [1] 0.2381186
# Normal deviance equals Tweedie deviance with parameter 0
deviance_normal(y_num, pred_num)
#> [1] 0.09037657
deviance_tweedie(y_num, pred_num, tweedie_p = 0)
#> [1] 0.09037657
deviance_tweedie(y_num, pred_num, tweedie_p = -0.001)
#> [1] 0.09053778
# Poisson deviance equals Tweedie deviance with parameter 1
deviance_poisson(y_num, pred_num)
#> [1] 0.01531595
deviance_tweedie(y_num, pred_num, tweedie_p = 1)
#> [1] 0.01531595
deviance_tweedie(y_num, pred_num, tweedie_p = 1.01)
#> [1] 0.01504756
# Gamma deviance equals Tweedie deviance with parameter 2
deviance_gamma(y_num, pred_num)
#> [1] 0.002633186
deviance_tweedie(y_num, pred_num, tweedie_p = 2)
#> [1] 0.002633186
deviance_tweedie(y_num, pred_num, tweedie_p = 1.99)
#> [1] 0.002679764
deviance_tweedie(y_num, pred_num, tweedie_p = 2.01)
#> [1] 0.00258742
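prop_within() from the metric list above can be evaluated on the same regression example; the tolerance of 0.5 used below is an arbitrary value chosen for illustration.

# Proportion of predictions within +/- 0.5 of the observed values (illustrative tolerance)
prop_within(y_num, pred_num, tol = 0.5)
prop_within(y_num, pred_num, w = weights, tol = 0.5)  # weighted version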
# The data
y_cat <- iris[["Species"]] == "setosa"
fit_cat <- glm(y_cat ~ Sepal.Length, data = iris, family = binomial())
pred_cat <- predict(fit_cat, type = "response")
# Performance metrics
AUC(y_cat, pred_cat) # unweighted
#> [1] 0.9586
AUC(y_cat, pred_cat, w = weights) # weighted
#> [1] 0.9629734
logLoss(y_cat, pred_cat) # Log loss
#> [1] 0.2394547
deviance_bernoulli(y_cat, pred_cat) # Log Loss * 2
#> [1] 0.4789093
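The confusion-matrix measures listed above require binary predictions, so the predicted probabilities have to be thresholded first; the cutoff of 0.5 below is just an illustrative choice, not something prescribed by the package.

# Binarize response and probabilities at an illustrative cutoff of 0.5
y_bin <- as.numeric(y_cat)
pred_bin <- as.numeric(pred_cat > 0.5)

accuracy(y_bin, pred_bin)
precision(y_bin, pred_bin)
recall(y_bin, pred_bin)
f1_score(y_bin, pred_bin, w = weights)  # weighted version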
Furthermore, r_squared() provides a generalization of R-squared, defined as the proportion of deviance explained, i.e., one minus the ratio of residual deviance and intercept-only deviance, see (Cohen 2003). For out-of-sample calculations, the null deviance is ideally calculated from the average in the training data; this can be controlled by setting reference_mean to the (possibly weighted) average response of the training data.
summary(fit_num)$r.squared
#> [1] 0.8673123
# Same
r_squared(y_num, pred_num)
#> [1] 0.8673123
r_squared(y_num, pred_num, deviance_function = deviance_tweedie, tweedie_p = 0)
#> [1] 0.8673123
r_squared(y_num, pred_num, deviance_function = deviance_tweedie, tweedie_p = 1.5)
#> [1] 0.8675195
# Weighted
r_squared(y_num, pred_num, w = weights)
#> [1] 0.8300011
r_squared(y_num, pred_num, w = weights, deviance_function = deviance_gamma)
#> [1] 0.8300644
r_squared(
  y_num, pred_num,
  w = weights, deviance_function = deviance_tweedie, tweedie_p = 2
)
#> [1] 0.8300644
# For comparison: same as the custom deviance defined next
r_squared(y_num, pred_num, deviance_function = deviance_tweedie, tweedie_p = 1.5)
#> [1] 0.8675195
# With respect to 'own' deviance formula
myTweedie <- function(actual, predicted, w = NULL, ...) {
  deviance_tweedie(actual, predicted, w, tweedie_p = 1.5, ...)
}

r_squared(y_num, pred_num, deviance_function = myTweedie)
#> [1] 0.8675195
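As a sketch of the out-of-sample use of reference_mean described above (the train/test split of iris is made up for illustration):

# Hypothetical split: odd rows for training, even rows for testing
in_train <- seq(1, nrow(iris), by = 2)
fit_tr <- lm(Sepal.Length ~ ., data = iris[in_train, ])
y_test <- iris[["Sepal.Length"]][-in_train]
pred_test <- predict(fit_tr, newdata = iris[-in_train, ])

# Null deviance is evaluated relative to the training average
r_squared(y_test, pred_test, reference_mean = mean(iris[["Sepal.Length"]][in_train]))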
To facilitate the use of these metrics in a {dplyr} chain, use the function performance(): Starting from a data set with actual and predicted values (and optional case weights), it calculates one or more metrics and returns them as a data.frame.
library(dplyr)
fit_num <- lm(Sepal.Length ~ ., data = iris)
# Regression with `Sepal.Length` as response
iris %>%
  mutate(pred = predict(fit_num, newdata = .)) %>%
  performance("Sepal.Length", "pred")
# Same
iris %>%
  mutate(pred = predict(fit_num, newdata = .)) %>%
  performance("Sepal.Length", "pred", metrics = rmse)
# Grouped by Species
iris %>%
  mutate(pred = predict(fit_num, newdata = .)) %>%
  group_by(Species) %>%
  do(performance(., actual = "Sepal.Length", predicted = "pred"))
# Customized output
iris %>%
  mutate(pred = predict(fit_num, newdata = .)) %>%
  performance(
    "Sepal.Length",
    "pred",
    value = "performance",
    metrics = list(`root-mean-squared error` = rmse)
  )
# Multiple measures
iris %>%
  mutate(pred = predict(fit_num, newdata = .)) %>%
  performance(
    "Sepal.Length",
    "pred",
    metrics = list(rmse = rmse, mae = mae, `R-squared` = r_squared)
  )
# Grouped by Species
iris %>%
  mutate(pred = predict(fit_num, newdata = .)) %>%
  group_by(Species) %>%
  do(
    performance(
      ., "Sepal.Length",
      "pred",
      metrics = list(rmse = rmse, mae = mae, `R-squared` = r_squared)
    )
  )
# Passing extra argument (Tweedie p)
iris %>%
  mutate(pred = predict(fit_num, newdata = .)) %>%
  performance(
    "Sepal.Length",
    "pred",
    metrics = list(
      `normal deviance` = deviance_normal, `Tweedie with p = 0` = deviance_tweedie
    ),
    tweedie_p = 0
  )
Some scoring functions depend on a further parameter p:

- deviance_tweedie(): depends on tweedie_p.
- elementary_score_expectile() and elementary_score_quantile(): depend on theta.
- prop_within(): depends on tol.

It might be of key relevance to evaluate such a function for varying p. That is where the function multi_metric() shines.
ir <- iris
ir$pred <- predict(fit_num, newdata = ir)

# Create multiple Tweedie deviance functions
multi_Tweedie <- multi_metric(deviance_tweedie, tweedie_p = c(0, seq(1, 3, by = 0.2)))
perf <- performance(
  ir,
  actual = "Sepal.Length",
  predicted = "pred",
  metrics = multi_Tweedie,
  key = "Tweedie p",
  value = "deviance"
)
perf$`Tweedie p` <- as.numeric(as.character(perf$`Tweedie p`))
head(perf)
#> Tweedie p deviance
#> 1 0.0 0.090376567
#> 2 1.0 0.015315945
#> 3 1.2 0.010757362
#> 4 1.4 0.007559956
#> 5 1.6 0.005316008
#> 6 1.8 0.003740296
# Deviance vs p
plot(deviance ~ `Tweedie p`, data = perf, type = "s")
# Same for Pseudo-R-Squared regarding Tweedie deviance
multi_Tweedie_r2 <- multi_metric(
  r_squared,
  deviance_function = deviance_tweedie, tweedie_p = c(0, seq(1, 3, by = 0.2))
)
perf <- performance(
  ir,
  actual = "Sepal.Length",
  predicted = "pred",
  metrics = multi_Tweedie_r2,
  key = "Tweedie p",
  value = "R-squared"
)
perf$`Tweedie p` <- as.numeric(as.character(perf$`Tweedie p`))
# Values vs. p
plot(`R-squared` ~ `Tweedie p`, data = perf, type = "s")
The same logic as in the last example can be used to create so-called Murphy diagrams (Ehm et al. 2016). The function murphy_diagram() wraps the calls above and allows one to get elementary scores for one or multiple models across a range of theta values; see also the R package murphydiagram.
y <- 1:10
two_models <- cbind(m1 = 1.1 * y, m2 = 1.2 * y)
murphy_diagram(y, two_models, theta = seq(0.9, 1.3, by = 0.01))