This vignette for the unpivotr package demonstrates unpivoting html tables of various kinds.
The HTML files are in the package directory at
`system.file("extdata", c("rowspan.html", "colspan.html", "nested.html"), package = "unpivotr")`.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rvest)
library(htmltools)
library(unpivotr)
If a table has cells merged across rows or columns (or both), then
`as_cells()` does not attempt to fill the cell contents across the rows or
columns. This is different from other packages, e.g. rvest. However, if
merged cells cause a table not to be rectangular, then `as_cells()` pads the
missing cells with blanks (`NA`).
# Path to the example table whose first header cell spans two rows; the raw
# HTML is embedded below so the reader can see the table as a browser shows it.
rowspan <- system.file("extdata", "rowspan.html", package = "unpivotr")
includeHTML(rowspan)
Header (1:2, 1) | Header (1, 2) |
---|---|
cell (2, 2) |
# rvest
# html_table() repeats the content of the merged header cell in the row it
# spans into.
rowspan %>%
  read_html() %>%
  html_table()
## [[1]]
## # A tibble: 1 × 2
## `Header (1:2, 1)` `Header (1, 2)`
## <chr> <chr>
## 1 Header (1:2, 1) cell (2, 2)
# unpivotr
# as_cells() returns one row per cell position; the position covered by the
# rowspan is padded with NA rather than filled with the header's content.
rowspan %>%
  read_html() %>%
  as_cells()
## [[1]]
## # A tibble: 4 × 4
## row col data_type html
## <int> <int> <chr> <chr>
## 1 1 1 html "<th rowspan=\"2\">Header (1:2, 1)</th>"
## 2 2 1 html <NA>
## 3 1 2 html "<th>Header (1, 2)</th>"
## 4 2 2 html "<td>cell (2, 2)</td>"
# Path to the example table whose header cell spans two columns; the raw HTML
# is embedded below.
colspan <- system.file("extdata", "colspan.html", package = "unpivotr")
includeHTML(colspan)
Header (1, 1:2) | |
---|---|
cell (2, 1) | cell (2, 2) |
# rvest
# html_table() repeats the merged header across both columns it spans.
colspan %>%
  read_html() %>%
  html_table()
## [[1]]
## # A tibble: 1 × 2
## `Header (1, 1:2)` `Header (1, 1:2)`
## <chr> <chr>
## 1 cell (2, 1) cell (2, 2)
# unpivotr
# as_cells() keeps the header only at its own position; the spanned-over
# position is NA.
colspan %>%
  read_html() %>%
  as_cells()
## [[1]]
## # A tibble: 4 × 4
## row col data_type html
## <int> <int> <chr> <chr>
## 1 1 1 html "<th colspan=\"2\">Header (1, 1:2)</th>"
## 2 2 1 html "<td>cell (2, 1)</td>"
## 3 1 2 html <NA>
## 4 2 2 html "<td>cell (2, 2)</td>"
# Path to the example table with a header cell merged across both rows and
# columns; the raw HTML is embedded below.
rowandcolspan <- system.file("extdata",
                             "row-and-colspan.html",
                             package = "unpivotr")
includeHTML(rowandcolspan)
Header (1:2, 1:2) | Header (2, 3) | |
---|---|---|
cell (3, 1) | cell (3, 2) | cell (3, 3) |
# rvest
rowandcolspan %>%
  read_html() %>%
  html_table()
## [[1]]
## # A tibble: 1 × 5
## `Header (1:2, 1:2)` `Header (1:2, 1:2)` `Header (2, 3)` `` ``
## <chr> <chr> <chr> <chr> <chr>
## 1 Header (1:2, 1:2) Header (1:2, 1:2) cell (3, 1) cell (3, 2) cell (3, …
# unpivotr
# Positions covered by the merged header, and those needed to pad the table
# out to a rectangle, come back as NA.
rowandcolspan %>%
  read_html() %>%
  as_cells()
## [[1]]
## # A tibble: 10 × 4
## row col data_type html
## <int> <int> <chr> <chr>
## 1 1 1 html "<th colspan=\"2\" rowspan=\"2\">Header (1:2, 1:2)</th…
## 2 2 1 html <NA>
## 3 1 2 html <NA>
## 4 2 2 html <NA>
## 5 1 3 html "<th>Header (2, 3)</th>"
## 6 2 3 html "<td>cell (3, 1)</td>"
## 7 1 4 html <NA>
## 8 2 4 html "<td>cell (3, 2)</td>"
## 9 1 5 html <NA>
## 10 2 5 html "<td>cell (3, 3)</td>"
`as_cells()` never descends into cells. If there is a table inside a cell,
then to parse that table, call `as_cells()` (or rvest's `html_table()`) again
on the HTML of that cell.
# Path to the example table that contains another table inside one of its
# cells; the raw HTML is embedded below.
nested <- system.file("extdata", "nested.html", package = "unpivotr")
includeHTML(nested)
Header (1, 1) | Header (1, 2) | ||||
---|---|---|---|---|---|
cell (2, 1) |
|
# rvest parses both tables
nested %>%
  read_html() %>%
  html_table(fill = TRUE)
## [[1]]
## # A tibble: 3 × 6
## `Header (1, 1)` `Header (1, 2)` `` `` `` ``
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 cell (2, 1) "Header (2, 2)(1, 1)\n … Head… Head… cell… cell…
## 2 Header (2, 2)(1, 1) "Header (2, 2)(1, 2)" <NA> <NA> <NA> <NA>
## 3 cell (2, 2)(2, 1) "cell (2, 2)(2, 1)" <NA> <NA> <NA> <NA>
##
## [[2]]
## # A tibble: 1 × 2
## `Header (2, 2)(1, 1)` `Header (2, 2)(1, 2)`
## <chr> <chr>
## 1 cell (2, 2)(2, 1) cell (2, 2)(2, 1)
# unpivotr
# as_cells() returns a list with one element per table it parses; keep the
# first (the outer table) and print it.
x <-
  nested %>%
  read_html() %>%
  as_cells() %>%
  .[[1]]
x
## # A tibble: 4 × 4
## row col data_type html
## <int> <int> <chr> <chr>
## 1 1 1 html "<th>Header (1, 1)</th>"
## 2 2 1 html "<td>cell (2, 1)</td>"
## 3 1 2 html "<th>Header (1, 2)</th>"
## 4 2 2 html "<td>\n <table>\n<tr>\n<th>Header (2, 2)(1, 1)…
# The html of the table inside a cell
# Pick the cell at row 2, column 2 of the outer table and pull out its raw
# HTML string.
cell <-
  x %>%
  dplyr::filter(row == 2, col == 2) %>%
  .$html
cell
## [1] "<td>\n <table>\n<tr>\n<th>Header (2, 2)(1, 1)</th>\n <th>Header (2, 2)(1, 2)</th>\n </tr>\n<tr>\n<td>cell (2, 2)(2, 1)</td>\n <td>cell (2, 2)(2, 1)</td>\n </tr>\n</table>\n</td>"
# Parsing the table inside the cell
# The cell's HTML string is itself parseable, so the same pipeline applies.
cell %>%
  read_html() %>%
  as_cells()
## [[1]]
## # A tibble: 4 × 4
## row col data_type html
## <int> <int> <chr> <chr>
## 1 1 1 html <th>Header (2, 2)(1, 1)</th>
## 2 2 1 html <td>cell (2, 2)(2, 1)</td>
## 3 1 2 html <th>Header (2, 2)(1, 2)</th>
## 4 2 2 html <td>cell (2, 2)(2, 1)</td>
A motivation for using `unpivotr::as_cells()` is that it extracts more than
just text – it can extract whatever part of the HTML you need.
Here, we extract URLs.
# Path to the example table whose cells contain hyperlinks; the raw HTML is
# embedded below.
urls <- system.file("extdata", "url.html", package = "unpivotr")
includeHTML(urls)
Scraping HTML. | ||
Sweet | as? | Yeah, right. |
# Extract the link targets from the HTML of a single table cell.
#
# x: the HTML of one cell as a string, or NA for a padded (missing) cell.
# Returns NA for a missing cell; otherwise the `href` attribute of every <a>
# node found in the cell.
cell_url <- function(x) {
  # Padded cells have no HTML to parse.
  if (is.na(x)) return(NA)
  x %>%
    read_html() %>%
    html_nodes("a") %>%
    html_attr("href")
}
# Extract the link texts from the HTML of a single table cell.
#
# x: the HTML of one cell as a string, or NA for a padded (missing) cell.
# Returns NA for a missing cell; otherwise the text of every <a> node found
# in the cell.
cell_text <- function(x) {
  # Padded cells have no HTML to parse.
  if (is.na(x)) return(NA)
  x %>%
    read_html() %>%
    html_nodes("a") %>%
    html_text()
}
# Parse the table into cells, then collect the text and URL of every link in
# each cell as list-columns and unnest them together, giving one row per link.
# The columns are passed as `c(text, url)`, the current tidyr interface; the
# older `unnest(text, url)` form triggers a deprecation warning.
urls %>%
  read_html() %>%
  as_cells() %>%
  .[[1]] %>%
  mutate(text = purrr::map(html, cell_text),
         url = purrr::map(html, cell_url)) %>%
  tidyr::unnest(c(text, url))
## # A tibble: 8 × 6
## row col data_type html text url
## <int> <int> <chr> <chr> <chr> <chr>
## 1 1 1 html "<td colspan=\"2\">\n<a href=\"https://www.… Scra… http…
## 2 1 1 html "<td colspan=\"2\">\n<a href=\"https://www.… HTML. http…
## 3 2 1 html "<td><a href=\"https://cran.r-project.org/\… Sweet http…
## 4 1 2 html <NA> <NA> <NA>
## 5 2 2 html "<td><a href=\"https://cran.r-project.org/p… as? http…
## 6 1 3 html <NA> <NA> <NA>
## 7 2 3 html "<td>\n<a href=\"https://cran.r-project.org… Yeah, http…
## 8 2 3 html "<td>\n<a href=\"https://cran.r-project.org… righ… http…