knitr::opts_chunk$set(warning = F,message = F,fig.width = 8,fig.height = 5)
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(lubridate))
library(autoTS)
The autoTS
package provides a high level interface for univariate time series predictions. It implements many algorithms, most of them provided by the forecast
package. The main goals of the package are :
The package is designed to work on one time series at a time. Parallel calculations can be put on top of it (see example below). The user has to provide 2 simple vectors :
lubridate
package can parse them)This package implements each algorithm with a unique parametrization, meaning that the user cannot tweak the algorithms (eg modify SARIMA specfic parameters).
For this example, we will use the GDP quarterly data of the european countries provided by eurostat. The database can be downloaded from this page and then chose “GDP and main components (output, expenditure and income) (namq_10_gdp)” and then adjust the time dimension to select all available data and download as a csv file with the correct formatting (1 234.56). The csv is in the “Data” folder of this notebook.
tmp_dir <- tempdir() %>% normalizePath()
unzip(zipfile = "../inst/extdata/namq_10_gdp.zip",exdir = tmp_dir)
dat <- read.csv(paste0(tmp_dir,"/namq_10_gdp_1_Data.csv"))
file.remove(paste0(tmp_dir,"/namq_10_gdp_1_Data.csv"),paste0(tmp_dir,"/namq_10_gdp_Label.csv"))
## [1] TRUE TRUE
str(dat)
## 'data.frame': 93456 obs. of 7 variables:
## $ TIME : chr "1975Q1" "1975Q1" "1975Q1" "1975Q1" ...
## $ GEO : chr "European Union - 27 countries (from 2019)" "European Union - 27 countries (from 2019)" "European Union - 27 countries (from 2019)" "European Union - 27 countries (from 2019)" ...
## $ UNIT : chr "Chain linked volumes, index 2010=100" "Chain linked volumes, index 2010=100" "Chain linked volumes, index 2010=100" "Chain linked volumes, index 2010=100" ...
## $ S_ADJ : chr "Unadjusted data (i.e. neither seasonally adjusted nor calendar adjusted data)" "Seasonally adjusted data, not calendar adjusted data" "Calendar adjusted data, not seasonally adjusted data" "Seasonally and calendar adjusted data" ...
## $ NA_ITEM : chr "Gross domestic product at market prices" "Gross domestic product at market prices" "Gross domestic product at market prices" "Gross domestic product at market prices" ...
## $ Value : chr ":" ":" ":" ":" ...
## $ Flag.and.Footnotes: chr "" "" "" "" ...
head(dat)
## TIME GEO
## 1 1975Q1 European Union - 27 countries (from 2019)
## 2 1975Q1 European Union - 27 countries (from 2019)
## 3 1975Q1 European Union - 27 countries (from 2019)
## 4 1975Q1 European Union - 27 countries (from 2019)
## 5 1975Q1 European Union - 27 countries (from 2019)
## 6 1975Q1 European Union - 27 countries (from 2019)
## UNIT
## 1 Chain linked volumes, index 2010=100
## 2 Chain linked volumes, index 2010=100
## 3 Chain linked volumes, index 2010=100
## 4 Chain linked volumes, index 2010=100
## 5 Current prices, million euro
## 6 Current prices, million euro
## S_ADJ
## 1 Unadjusted data (i.e. neither seasonally adjusted nor calendar adjusted data)
## 2 Seasonally adjusted data, not calendar adjusted data
## 3 Calendar adjusted data, not seasonally adjusted data
## 4 Seasonally and calendar adjusted data
## 5 Unadjusted data (i.e. neither seasonally adjusted nor calendar adjusted data)
## 6 Seasonally adjusted data, not calendar adjusted data
## NA_ITEM Value Flag.and.Footnotes
## 1 Gross domestic product at market prices :
## 2 Gross domestic product at market prices :
## 3 Gross domestic product at market prices :
## 4 Gross domestic product at market prices :
## 5 Gross domestic product at market prices :
## 6 Gross domestic product at market prices :
First, we have to clean the data (not too ugly though). First thing is to convert the TIME column into a well known date format that lubridate can handle. In this example, the yq
function can parse the date without modification of the column. Then, we have to remove the blank in the values that separates thousands…
Finally, we only keep data since 2000 and the unadjusted series in current prices.
After that, we should get one time series per country
dat <- mutate(dat,dates=yq(as.character(TIME)),
values = as.numeric(stringr::str_remove(Value," "))) %>%
filter(year(dates)>=2000 &
S_ADJ=="Unadjusted data (i.e. neither seasonally adjusted nor calendar adjusted data)" &
UNIT == "Current prices, million euro")
filter(dat,GEO %in% c("France","Austria")) %>%
ggplot(aes(dates,values,color=GEO)) + geom_line() + theme_minimal() +
labs(title="GDP of (completely) random countries")
Now we're good to go !
Let's see how to use the package on one time series :
ex1 <- filter(dat,GEO=="France")
preparedTS <- prepare.ts(ex1$dates,ex1$values,"quarter")
## What is in this new object ?
str(preparedTS)
## List of 4
## $ obj.ts : Time-Series [1:77] from 2000 to 2019: 363007 369185 362905 383489 380714 ...
## $ obj.df :'data.frame': 77 obs. of 2 variables:
## ..$ dates: Date[1:77], format: "2000-01-01" "2000-04-01" ...
## ..$ val : num [1:77] 363007 369185 362905 383489 380714 ...
## $ freq.num : num 4
## $ freq.alpha: chr "quarter"
plot.ts(preparedTS$obj.ts)
ggplot(preparedTS$obj.df,aes(dates,val)) + geom_line() + theme_minimal()
Get the best algorithm for this time series :
## What is the best model for prediction ?
best.algo <- getBestModel(ex1$dates,ex1$values,"quarter",graph = F)
names(best.algo)
## [1] "prepedTS" "best" "train.errors" "res.train" "algos"
## [6] "graph.train"
print(paste("The best algorithm is",best.algo$best))
## [1] "The best algorithm is my.ets"
best.algo$graph.train
You find in the result of this function :
The result of this function can be used as direct input of the my.prediction
function
## Build the predictions
final.pred <- my.predictions(bestmod = best.algo)
tail(final.pred,24)
## # A tibble: 24 x 4
## dates type actual.value ets
## <date> <chr> <dbl> <dbl>
## 1 2016-04-01 <NA> 560873 NA
## 2 2016-07-01 <NA> 546383 NA
## 3 2016-10-01 <NA> 572752 NA
## 4 2017-01-01 <NA> 565221 NA
## 5 2017-04-01 <NA> 573720 NA
## 6 2017-07-01 <NA> 563671 NA
## 7 2017-10-01 <NA> 592453 NA
## 8 2018-01-01 <NA> 580884 NA
## 9 2018-04-01 <NA> 586869 NA
## 10 2018-07-01 <NA> 577904 NA
## # … with 14 more rows
ggplot(final.pred) + geom_line(aes(dates,actual.value),color="black") +
geom_line(aes_string("dates",stringr::str_remove(best.algo$best,"my."),linetype="type"),color="red") +
theme_minimal()
Not too bad, right ?
Let's say we want to make a prediction for each country in the same time and be the fastest possible \(\rightarrow\) let's combine the package's functions with parallel computing. We have to reshape the data to get one column per country and then iterate over the columns of the data frame.
suppressPackageStartupMessages(library(tidyr))
dat.wide <- select(dat,GEO,dates,values) %>%
group_by(dates) %>%
spread(key = "GEO",value = "values")
head(dat.wide)
## # A tibble: 6 x 45
## # Groups: dates [6]
## dates Albania Austria Belgium `Bosnia and Her… Bulgaria Croatia Cyprus
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2000-01-01 NA 50422. 62261 NA 2941. 5266. 2547.
## 2 2000-04-01 NA 53180. 65046 NA 3252. 5811 2784.
## 3 2000-07-01 NA 53881. 62754 NA 4015. 6409. 2737.
## 4 2000-10-01 NA 56123. 68161 NA 4103. 6113 2738.
## 5 2001-01-01 NA 52911. 64318 NA 3284. 5777. 2688.
## 6 2001-04-01 NA 54994. 67537 NA 3669. 6616. 2946.
## # … with 37 more variables: Czechia <dbl>, Denmark <dbl>, Estonia <dbl>, `Euro
## # area (12 countries)` <dbl>, `Euro area (19 countries)` <dbl>, `Euro area
## # (EA11-2000, EA12-2006, EA13-2007, EA15-2008, EA16-2010, EA17-2013,
## # EA18-2014, EA19)` <dbl>, `European Union - 15 countries (1995-2004)` <dbl>,
## # `European Union - 27 countries (from 2019)` <dbl>, `European Union - 28
## # countries` <dbl>, Finland <dbl>, France <dbl>, `Germany (until 1990 former
## # territory of the FRG)` <dbl>, Greece <dbl>, Hungary <dbl>, Iceland <dbl>,
## # Ireland <dbl>, Italy <dbl>, `Kosovo (under United Nations Security Council
## # Resolution 1244/99)` <dbl>, Latvia <dbl>, Lithuania <dbl>,
## # Luxembourg <dbl>, Malta <dbl>, Montenegro <dbl>, Netherlands <dbl>, `North
## # Macedonia` <dbl>, Norway <dbl>, Poland <dbl>, Portugal <dbl>,
## # Romania <dbl>, Serbia <dbl>, Slovakia <dbl>, Slovenia <dbl>, Spain <dbl>,
## # Sweden <dbl>, Switzerland <dbl>, Turkey <dbl>, `United Kingdom` <dbl>
Note : The following code is not executed for this vignette but does work (you can try it at home)
library(doParallel)
pipeline <- function(dates,values)
{
pred <- getBestModel(dates,values,"quarter",graph = F) %>%
my.predictions()
return(pred)
}
doMC::registerDoMC(parallel::detectCores()-1) # parallel backend (for UNIX)
system.time({
res <- foreach(ii=2:ncol(dat.wide),.packages = c("dplyr","autoTS")) %dopar%
pipeline(dat.wide$dates,pull(dat.wide,ii))
})
names(res) <- colnames(dat.wide)[-1]
str(res)
There is no best algorithm in general \(\Rightarrow\) depends on the data ! Likewise, this is not executed in this vignette, but works if you want to replicate it.
sapply(res,function(xx) colnames(select(xx,-dates,-type,-actual.value)) ) %>% table()
sapply(res,function(xx) colnames(select(xx,-dates,-type,-actual.value)) )