-1
对不起,这是一个奇怪的问题,但我似乎无法自己弄清楚。好消息是:我认为它是完全可重复的。古怪的行为与Rvest维基百科刮
我正在尝试构建一个简单的R函数来使用{rvest}为维基百科搜索音乐家的故乡。基本上,我写的功能是作品,但对于某些艺术家来说,它不起作用(返回NULL
)。 (兰迪纽曼就是这样的,所以我会用他作为一个例子。)
当我刚刚运行整个事情(如下图),然后findHome("randy newman")
我得到NULL
但后来当我尝试调试,我运行tableMusic()
功能,然后artist <- "randy newman"
,然后运行所有的功能artistData()
胆量,IT WORKS!
然后,一旦我这样做,我可以运行findHome("randy newman")
,它会正常工作。是什么赋予了?!我是否有错误的顺序或东西?我似乎无法弄清楚。
任何帮助,非常感谢。这里是代码:
library(rvest)
findHome <- function(artist) {
##function to look for the table with the right info
tableMusic <- function(data) {
if(!any(grepl("years active|labels|instruments", data[,1], ignore.case=T))) {
for (i in 2:5) {
data <- try(url %>% html %>% html_nodes(xpath=paste('//*[@id="mw-content-text"]/table[', i, ']', sep="")) %>% html_table(fill=T), silent=T)
if(!class(data)=="try-error" & length(data)>0) {
if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)}
if(any(grepl("years active|labels|instruments", data[,1], ignore.case=T))) {
break
}
}
}
}
if(class(data)=="try-error" | length(data)<1) {
data <- NULL
} else if (!any(grepl("years active|labels|instruments", data[,1], ignore.case=T))) {
data <- NULL
}
data
}
#function to pull data and try different pages if the first is wrong
artistData <- function(artist) {
artist <- gsub(" ", "_", artist)
artist <- gsub("'", "%27", artist)
##first try getting the data
url <- paste("https://en.wikipedia.org/wiki/", artist, sep="")
data <- try(url %>% html %>% html_nodes(xpath='//*[@id="mw-content-text"]/table[1]') %>% html_table(fill=T), silent=T)
##check if it's the right page (deal with disambiguation issues)
if(!class(data)=="try-error" & length(data)>0) {
if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)}
data <- tableMusic(data)
}
## if try-error or musicTable==NULL, try _(band)
if(class(data)=="try-error" | is.null(data) | length(data)<1) {
url <- paste("https://en.wikipedia.org/wiki/", artist, "_(band)", sep="")
data <- try(url %>% html %>% html_nodes(xpath='//*[@id="mw-content-text"]/table[1]') %>% html_table(fill=T), silent=T)
if(class(data)=="try-error"){
data <- NULL
} else {
if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)}
data <- tableMusic(data)
}
} else {
if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)}
}
## if try-error or musicTable==NULL, try _(musician)
if(class(data)=="try-error" | is.null(data) | length(data)<1) {
url <- paste("https://en.wikipedia.org/wiki/", artist, "_(musician)", sep="")
data <- try(url %>% html %>% html_nodes(xpath='//*[@id="mw-content-text"]/table[1]') %>% html_table(fill=T), silent=T)
if(class(data)=="try-error"){
data <- NULL
} else {
if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)}
data <- tableMusic(data)
}
} else {
if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)}
}
data
}
## first try finding data
data <- artistData(artist)
## try finding with and/&
if(is.null(data)){data <- artistData(unlist(strsplit(artist, " and| &"))[1])}
## if no matches return ""
if(class(data)=="try-error" | is.null(data)) {
data <- ""
return()
} else {
if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)}
}
## if we have a matching page, pull the relevant data
origin <- data[data[,1]=="Origin",2]
if(length(origin)>0) {
home <- origin
} else {
born <- data[data[,1]=="Born",2]
if (length(born)>0) {
home <- unlist(strsplit(born, "age.[0-9]+)"))[2]
} else {
home <- ""
}
}
home
}
findHome("randy newman")
与昵称有关吗?或缺少下划线? –