2015-09-25 25 views
-1

对不起,这是一个奇怪的问题,但我似乎无法自己弄清楚。好消息是:我认为它是完全可重复的。古怪的行为与Rvest维基百科刮

我正在尝试构建一个简单的R函数来使用{rvest}为维基百科搜索音乐家的故乡。基本上,我写的功能是作品,但对于某些艺术家来说,它不起作用(返回NULL)。 (兰迪纽曼就是这样的,所以我会用他作为一个例子。)

当我刚刚运行整个事情(如下图),然后findHome("randy newman")我得到NULL但后来当我尝试调试,我运行tableMusic()功能,然后artist <- "randy newman",然后运行所有的功能artistData()胆量,IT WORKS!

然后,一旦我这样做,我可以运行findHome("randy newman"),它会正常工作。是什么赋予了?!我是否有错误的顺序或东西?我似乎无法弄清楚。

任何帮助,非常感谢。这里是代码:

library(rvest) 
findHome <- function(artist) { 
##function to look for the table with the right info 
tableMusic <- function(data) { 
    if(!any(grepl("years active|labels|instruments", data[,1], ignore.case=T))) { 
     for (i in 2:5) { 
      data <- try(url %>% html %>% html_nodes(xpath=paste('//*[@id="mw-content-text"]/table[', i, ']', sep="")) %>% html_table(fill=T), silent=T) 
      if(!class(data)=="try-error" & length(data)>0) { 
       if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)} 
       if(any(grepl("years active|labels|instruments", data[,1], ignore.case=T))) { 
        break 
       } 
      } 
     } 
    } 
    if(class(data)=="try-error" | length(data)<1) { 
     data <- NULL 
    } else if (!any(grepl("years active|labels|instruments", data[,1], ignore.case=T))) { 
     data <- NULL 
    } 
    data 
} 
#function to pull data and try different pages if the first is wrong 
artistData <- function(artist) { 
    artist <- gsub(" ", "_", artist) 
    artist <- gsub("'", "%27", artist) 
    ##first try getting the data 
    url <- paste("https://en.wikipedia.org/wiki/", artist, sep="") 
    data <- try(url %>% html %>% html_nodes(xpath='//*[@id="mw-content-text"]/table[1]') %>% html_table(fill=T), silent=T) 
    ##check if it's the right page (deal with disambiguation issues) 
    if(!class(data)=="try-error" & length(data)>0) { 
     if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)} 
     data <- tableMusic(data) 
    } 
    ## if try-error or musicTable==NULL, try _(band) 
    if(class(data)=="try-error" | is.null(data) | length(data)<1) { 
     url <- paste("https://en.wikipedia.org/wiki/", artist, "_(band)", sep="") 
     data <- try(url %>% html %>% html_nodes(xpath='//*[@id="mw-content-text"]/table[1]') %>% html_table(fill=T), silent=T) 
     if(class(data)=="try-error"){ 
      data <- NULL 
     } else { 
      if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)} 
      data <- tableMusic(data) 
     } 
    } else { 
     if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)} 
    } 
    ## if try-error or musicTable==NULL, try _(musician) 
    if(class(data)=="try-error" | is.null(data) | length(data)<1) { 
     url <- paste("https://en.wikipedia.org/wiki/", artist, "_(musician)", sep="") 
     data <- try(url %>% html %>% html_nodes(xpath='//*[@id="mw-content-text"]/table[1]') %>% html_table(fill=T), silent=T) 
     if(class(data)=="try-error"){ 
      data <- NULL 
     } else { 
      if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)} 
      data <- tableMusic(data) 
     } 
    } else { 
     if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)} 
    } 
    data 
} 
## first try finding data 
data <- artistData(artist) 
## try finding with and/& 
if(is.null(data)){data <- artistData(unlist(strsplit(artist, " and| &"))[1])} 
## if no matches return "" 
if(class(data)=="try-error" | is.null(data)) { 
    data <- "" 
    return() 
} else { 
    if(class(data)!="data.frame") {data <- data.frame(data, stringsAsFactors=F)} 
} 

## if we have a matching page, pull the relevant data 
origin <- data[data[,1]=="Origin",2] 
if(length(origin)>0) { 
    home <- origin 
} else { 
    born <- data[data[,1]=="Born",2] 
    if (length(born)>0) { 
     home <- unlist(strsplit(born, "age.[0-9]+)"))[2] 
    } else { 
     home <- "" 
    } 
} 
home 
} 

findHome("randy newman") 
+0

与昵称有关吗?或缺少下划线? –

回答

0

我想通了。我必须在tableMusic()函数中添加url参数。就像这样,它会从以前的搜索中回收网址。感谢您的建议。