下面这段代码:
library(rvest)
library(httr)
library(tibble)
library(purrr)
# Fetch the search page and list the choices offered by its "fyrirsp"
# <select> element (form value + human-readable label).
pg <- read_html("http://www.fiskistofa.is/english/quotas-and-catches/total-catches-by-harbours-months-and-vessel-type/")
opts <- html_nodes(pg, "select#fyrirsp>option")
# html_attr() and html_text() are vectorised over a node set, so the table
# can be built in one step; tibble() replaces the deprecated data_frame().
tibble(
  value = html_attr(opts, "value"),
  display = html_text(opts, trim = TRUE)
)
## # A tibble: 5 × 2
## value display
## <chr> <chr>
## 1 ateh By landing harbor
## 2 hteh Main species by landing harbor
## 3 atem By month
## 4 ateu By vessel type
## 5 atev By fishing gear
可以让你得到 `select` 下拉框中的各个选项值。
但是,你(可能)真正需要的是这样的:
#' Get catch data by various parameters
#'
#' Submits the catch search form via HTTP POST and returns the first table in
#' the response whose text contains "Total".
#'
#' @param search_by one of `"ateh"`, `"hteh"`, `"atem"`, `"ateu"`, `"atev"` or
#'   the equivalent display name (case-insensitive): "By landing harbor",
#'   "Main species by landing harbor", "By month", "By vessel type",
#'   "By fishing gear"
#' @param dagur_fra,dagur_til from/to dates (either an R `Date` object or a
#'   date string in `dd.mm.yyyy` format)
#' @param lang language (prbly not relevant but it was part of the HTTP
#'   request). "en" by default and I assume a subset of ISO2C is valid
#' @return a tibble built from the first matching HTML table. Signals an
#'   error for an unknown `search_by`, a malformed date, a non-200 HTTP
#'   response, or a response that contains no matching table.
get_catch_data <- function(search_by, dagur_fra, dagur_til, lang = "en") {
  URL <- "http://www.fiskistofa.is/english/quotas-and-catches/total-catches-by-harbours-months-and-vessel-type/bbt.jsp"

  # Lookup table translating either the form value or its display name
  # (lower-cased) to the value the form expects.
  by_trans <- c(
    `by landing harbor` = "ateh", `ateh` = "ateh",
    `main species by landing harbor` = "hteh", `hteh` = "hteh",
    `by month` = "atem", `atem` = "atem",
    `by vessel type` = "ateu", `ateu` = "ateu",
    `by fishing gear` = "atev", `atev` = "atev"
  )

  if (!is.character(search_by) || length(search_by) != 1L || is.na(search_by)) {
    stop("`search_by` must be a single character string", call. = FALSE)
  }
  key <- tolower(trimws(search_by))
  if (!key %in% names(by_trans)) {
    stop(
      "`search_by` must be one of: ",
      paste(unique(names(by_trans)), collapse = ", "),
      call. = FALSE
    )
  }
  search_by <- by_trans[[key]]

  # Normalise a date argument to the dd.mm.yyyy string the form expects and
  # reject anything that is not a real calendar date in that format.
  as_form_date <- function(x, arg) {
    if (inherits(x, "Date")) {
      x <- format(x, "%d.%m.%Y")
    }
    ok <- is.character(x) && length(x) == 1L && !is.na(x) &&
      grepl("^[0-9]{1,2}\\.[0-9]{1,2}\\.[0-9]{4}$", x) &&
      !is.na(as.Date(x, format = "%d.%m.%Y"))
    if (!ok) {
      stop(
        "`", arg, "` must be a single Date or a valid date string in ",
        "dd.mm.yyyy format",
        call. = FALSE
      )
    }
    x
  }
  dagur_fra <- as_form_date(dagur_fra, "dagur_fra")
  dagur_til <- as_form_date(dagur_til, "dagur_til")

  # The form is submitted via HTTP POST. Packages are referenced by namespace
  # so a missing dependency fails loudly instead of require() returning FALSE.
  res <- httr::POST(
    url = URL,
    query = list(lang = lang),
    body = list(
      fyrirsp = search_by,
      dagurFra = dagur_fra,
      dagurTil = dagur_til,
      hnappur = "Enter"
    ),
    encode = "form"
  )

  # Bail on anything but a successful HTTP response.
  httr::stop_for_status(res)

  doc <- httr::content(res, as = "parsed")

  # Find and extract the table. This assumes the display table always
  # contains "Total" and no earlier table on the page does: a somewhat
  # fragile assumption that should be the first thing to debug if this breaks.
  tables <- rvest::html_nodes(doc, xpath = ".//table[contains(., 'Total')]")
  if (length(tables) == 0L) {
    stop(
      "no result table containing 'Total' was found in the response",
      call. = FALSE
    )
  }

  # NOTE(review): the site appears to print "." as a thousands separator, so
  # a cell such as "3.262" may be parsed as 3.262 rather than 3262. Confirm
  # against the page before relying on magnitudes.
  out <- rvest::html_table(tables, header = TRUE)
  tibble::as_tibble(out[[1]])
}
^^ 上面是一个提交表单并返回结果数据表的函数。例如:
# Catches by landing harbor, 1 July 2016 through 7 December 2016.
get_catch_data(
  search_by = "ateh",
  dagur_fra = as.Date("2016-07-01"),
  dagur_til = as.Date("2016-12-07")
)
## # A tibble: 71 × 64
## Species Vestmannaeyjar Þorlákshöfn Grindavík Sandgerði
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Cod 3.262 539 2.708 953
## 2 Haddock 1.521 145 560.000 93
## 3 Saithe 2.849 318 791.000 160
## 4 Redfish 2.580 642 1.499 51
## 5 Ling 316.000 387 216.000 41
## 6 Blueling, European ling 62.000 22 13.000 3
## 7 Tusk, torsk, cusk 8.000 50 184.000 1
## 8 Atlantic wolffish 77.000 29 83.000 14
## 9 Monkfish 82.000 44 18.000 40
## 10 Greater argentine, 4.000 0 54.000 0
## # ... with 61 more rows, and 59 more variables: Keflavík <dbl>,
## # Hafnarfjörður <dbl>, Kópavogur <int>, Reykjavík <dbl>, Akranes <int>,
## # Arnarstapi <int>, Rif <dbl>, Ólafsvík <dbl>, Grundarfjörður <dbl>,
## # Stykkishólmur <dbl>, Búðardalur <int>, Brjánslækur <int>,
## # Patreksfjörður <dbl>, Tálknafjörður <int>, Bíldudalur <int>,
## # Þingeyri <dbl>, Flateyri <dbl>, Suðureyri <dbl>, Bolungarvík <dbl>,
## # Ísafjörður <dbl>, Súðavík <int>, Norðurfjörður <int>, Drangsnes <int>,
## # Hólmavík <int>, Hvammstangi <int>, Skagaströnd <dbl>,
## # Sauðárkrókur <dbl>, Hofsós <int>, Siglufjörður <dbl>,
## # Ólafsfjörður <int>, Grímsey <int>, Hrísey <int>, Dalvík <dbl>,
## # Árskógssandur <int>, Hauganes <int>, Akureyri <dbl>, Húsavík <dbl>,
## # Kópasker <int>, Raufarhöfn <dbl>, Þórshöfn <dbl>, Bakkafjörður <int>,
## # Vopnafjörður <dbl>, `Borgarfjörður Eystri` <int>, Seyðisfjörður <dbl>,
## # Mjóifjörður <int>, Neskaupstaður <dbl>, Eskifjörður <dbl>,
## # Reyðarfjörður <dbl>, Fáskrúðsfjörður <dbl>, Stöðvarfjörður <dbl>,
## # Breiðdalsvík <int>, Djúpivogur <dbl>, Hornafjörður <dbl>, `Ýmsir
## # staðir` <int>, Noregur <int>, Færeyjar <dbl>, Holland <dbl>,
## # Skarðsstöð <int>, Total <dbl>
和:
# Catches by month for July 2016.
get_catch_data(
  search_by = "atem",
  dagur_fra = as.Date("2016-07-01"),
  dagur_til = as.Date("2016-07-31")
)
## # A tibble: 50 × 3
## Species `July 2016` Total
## <chr> <dbl> <dbl>
## 1 Cod 13.908 13.908
## 2 Haddock 2.063 2.063
## 3 Saithe 5.539 5.539
## 4 Redfish 4.265 4.265
## 5 Ling 302.000 302.000
## 6 Blueling, European ling 38.000 38.000
## 7 Tusk, torsk, cusk 29.000 29.000
## 8 Atlantic wolffish 506.000 506.000
## 9 Monkfish 106.000 106.000
## 10 Greater argentine, 180.000 180.000
## # ... with 40 more rows
和:
# Catches by vessel type, 1 July 2016 through 7 December 2016.
get_catch_data(
  search_by = "ateu",
  dagur_fra = as.Date("2016-07-01"),
  dagur_til = as.Date("2016-12-07")
)
## # A tibble: 71 × 9
## Species `Costal fishing` `Research vessel` Trawler
## <chr> <dbl> <int> <dbl>
## 1 Cod 3.747 33 47.325
## 2 Haddock 13.000 8 4.785
## 3 Saithe 150.000 4 16.182
## 4 Redfish 46.000 28 19.330
## 5 Ling 1.000 0 357.000
## 6 Blueling, European ling 0.000 1 290.000
## 7 Tusk, torsk, cusk 2.000 0 21.000
## 8 Atlantic wolffish 2.000 0 710.000
## 9 Monkfish 0.000 0 45.000
## 10 Greater argentine, 0.000 0 1.334
## # ... with 61 more rows, and 5 more variables: `Quota vessel` <dbl>,
## # `Small quota boat` <dbl>, `Recreational fishery` <int>, `Hook and line
## # boat` <dbl>, Total <dbl>
(你应该明白意思了)。
一些注意事项:
- 你并不一定需要 `tibble`,但出于多方面的原因,我更喜欢用它,而不是内置的数据框创建函数。
- 您应该添加更多的参数错误检查。
- 你应该清理一些返回的数据帧的列名称
- 你应该清理(即移除)结果数据框中的“Total”(合计)列/行。
- 我没有找到能让“By fishing gear”(按渔具)返回任何数据的参数组合(在网页上直接操作也是如此)。
你想用Java或Javascript做? –
不了解Java。用Javascript怎么样? – Lain
您提供的网站是用JAVA语言开发的。所以你的第一步:先**决定**你想用**哪种技术**来开发你的东西....! –