2013-08-21 28 views
8

我试图从openNLP/NLP包中并行使用词性标记。我需要代码在任何操作系统上工作,所以我选择并行使用parLapply函数(但我可以向其他操作系统无关的选项开放)。在过去,我从parLapply的openNLP软件包中运行tagPOS函数,没有任何问题。然而,openNLP软件包最近的一些变化消除了tagPOS并增加了一些更灵活的选项。 Kurt非常善于帮助我重新使用新软件包的工具中的tagPOS函数。我可以使lapply版本正常工作,但不是并行版本。它一直说节点需要传递给它们更多的变量,直到它最终要求openNLP提供一个非导出的函数。这似乎很奇怪,它会不断要求通过越来越多的变量,这告诉我我错误地设置了parLapply。如何设置tagPOS以并行,独立于操作系统的方式运行?并行parLapply设置

library(openNLP) 
library(NLP) 
library(parallel) 

## POS tagger 
tagPOS <- function(x, pos_tag_annotator, ...) { 
    s <- as.String(x) 
    ## Need sentence and word token annotations. 
    word_token_annotator <- Maxent_Word_Token_Annotator() 
    a2 <- Annotation(1L, "sentence", 1L, nchar(s)) 
    a2 <- annotate(s, word_token_annotator, a2) 
    a3 <- annotate(s, pos_tag_annotator, a2) 

    ## Determine the distribution of POS tags for word tokens. 
    a3w <- a3[a3$type == "word"] 
    POStags <- unlist(lapply(a3w$features, `[[`, "POS")) 

    ## Extract token/POS pairs (all of them): easy. 
    POStagged <- paste(sprintf("%s/%s", s[a3w], POStags), collapse = " ") 
    list(POStagged = POStagged, POStags = POStags) 
} ## End of tagPOS function 

## Set up a parallel run 
text.var <- c("I like it.", "This is outstanding soup!", 
    "I really must get the recipe.") 
ntv <- length(text.var) 
PTA <- Maxent_POS_Tag_Annotator() 

cl <- makeCluster(mc <- getOption("cl.cores", detectCores()/2)) 
clusterExport(cl=cl, varlist=c("text.var", "ntv", 
    "tagPOS", "PTA", "as.String", "Maxent_Word_Token_Annotator"), 
    envir = environment()) 
m <- parLapply(cl, seq_len(ntv), function(i) { 
     x <- tagPOS(text.var[i], PTA) 
     return(x) 
    } 
) 
stopCluster(cl) 

## Error in checkForRemoteErrors(val) : 
## 3 nodes produced errors; first error: could not find function 
## "Maxent_Simple_Word_Tokenizer" 

openNLP::Maxent_Simple_Word_Tokenizer 

## >openNLP::Maxent_Simple_Word_Tokenizer 
## Error: 'Maxent_Simple_Word_Tokenizer' is not an exported 
##  object from 'namespace:openNLP' 

## It's a non exported function 
openNLP:::Maxent_Simple_Word_Tokenizer 


## Demo that it works with lapply 
lapply(seq_len(ntv), function(i) { 
    tagPOS(text.var[i], PTA) 
}) 

lapply(text.var, function(x) { 
    tagPOS(x, PTA) 
}) 

## >  lapply(seq_len(ntv), function(i) { 
## +   tagPOS(text.var[i], PTA) 
## +  }) 
## [[1]] 
## [[1]]$POStagged 
## [1] "I/PRP like/IN it/PRP ./." 
## 
## [[1]]$POStags 
## [1] "PRP" "IN" "PRP" "." 
## 
## [[1]]$word.count 
## [1] 3 
## 
## 
## [[2]] 
## [[2]]$POStagged 
## [1] "THis/DT is/VBZ outstanding/JJ soup/NN !/." 
## 
## [[2]]$POStags 
## [1] "DT" "VBZ" "JJ" "NN" "." 
## 
## [[2]]$word.count 
## [1] 4 
## 
## 
## [[3]] 
## [[3]]$POStagged 
## [1] "I/PRP really/RB must/MD get/VB the/DT recip/NN ./." 
## 
## [[3]]$POStags 
## [1] "PRP" "RB" "MD" "VB" "DT" "NN" "." 
## 
## [[3]]$word.count 
## [1] 6 

编辑:每史蒂夫的建议

注意openNLP是全新的。我从CRAN的tar.gz安装了ver 2.1。即使此函数存在,我也会收到以下错误。

library(openNLP);库(NLP);库(平行)

tagPOS <- function(text.var, pos_tag_annotator, ...) { 
    s <- as.String(text.var) 

    ## Set up the POS annotator if missing (for parallel) 
    if (missing(pos_tag_annotator)) { 
     PTA <- Maxent_POS_Tag_Annotator() 
    } 

    ## Need sentence and word token annotations. 
    word_token_annotator <- Maxent_Word_Token_Annotator() 
    a2 <- Annotation(1L, "sentence", 1L, nchar(s)) 
    a2 <- annotate(s, word_token_annotator, a2) 
    a3 <- annotate(s, PTA, a2) 

    ## Determine the distribution of POS tags for word tokens. 
    a3w <- a3[a3$type == "word"] 
    POStags <- unlist(lapply(a3w$features, "[[", "POS")) 

    ## Extract token/POS pairs (all of them): easy. 
    POStagged <- paste(sprintf("%s/%s", s[a3w], POStags), collapse = " ") 
    list(POStagged = POStagged, POStags = POStags) 
} 

text.var <- c("I like it.", "This is outstanding soup!", 
    "I really must get the recipe.") 

cl <- makeCluster(mc <- getOption("cl.cores", detectCores()/2)) 
clusterEvalQ(cl, {library(openNLP); library(NLP)}) 
m <- parLapply(cl, text.var, tagPOS) 

## > m <- parLapply(cl, text.var, tagPOS) 
## Error in checkForRemoteErrors(val) : 
## 3 nodes produced errors; first error: could not find function "Maxent_POS_Tag_Annotator" 

stopCluster(cl) 


> packageDescription('openNLP') 
Package: openNLP 
Encoding: UTF-8 
Version: 0.2-1 
Title: Apache OpenNLP Tools Interface 
[email protected]: person("Kurt", "Hornik", role = c("aut", "cre"), email = 
      "[email protected]") 
Description: An interface to the Apache OpenNLP tools (version 1.5.3). The Apache OpenNLP 
      library is a machine learning based toolkit for the processing of natural language 
      text written in Java. It supports the most common NLP tasks, such as tokenization, 
      sentence segmentation, part-of-speech tagging, named entity extraction, chunking, 
      parsing, and coreference resolution. See http://opennlp.apache.org/ for more 
      information. 
Imports: NLP (>= 0.1-0), openNLPdata (>= 1.5.3-1), rJava (>= 0.6-3) 
SystemRequirements: Java (>= 5.0) 
License: GPL-3 
Packaged: 2013-08-20 13:23:54 UTC; hornik 
Author: Kurt Hornik [aut, cre] 
Maintainer: Kurt Hornik <[email protected]> 
NeedsCompilation: no 
Repository: CRAN 
Date/Publication: 2013-08-20 15:41:22 
Built: R 3.0.1; ; 2013-08-20 13:48:47 UTC; windows 

回答

5

既然你打电话从NLP上的集群工人功能,你应该在调用parLapply之前加载它在每一个工人。你可以做到这一点从工人功能,但我倾向于创建群集对象后使用权clusterCallclusterEvalQ

clusterEvalQ(cl, {library(openNLP); library(NLP)}) 

由于as.StringMaxent_Word_Token_Annotator在这些包,他们不应该被导出。

请注意,在我的机器上运行您的示例时,我注意到PTA对象在导出到工作机器后不起作用。据推测,该对象中的某些内容不能安全地序列化和非序列化。在我使用clusterEvalQ在工作人员上创建该对象之后,该示例成功运行。这是使用openNLP 0.2-1:

library(parallel) 
tagPOS <- function(x, ...) { 
    s <- as.String(x) 
    word_token_annotator <- Maxent_Word_Token_Annotator() 
    a2 <- Annotation(1L, "sentence", 1L, nchar(s)) 
    a2 <- annotate(s, word_token_annotator, a2) 
    a3 <- annotate(s, PTA, a2) 
    a3w <- a3[a3$type == "word"] 
    POStags <- unlist(lapply(a3w$features, `[[`, "POS")) 
    POStagged <- paste(sprintf("%s/%s", s[a3w], POStags), collapse = " ") 
    list(POStagged = POStagged, POStags = POStags) 
} 
text.var <- c("I like it.", "This is outstanding soup!", 
    "I really must get the recipe.") 
cl <- makeCluster(mc <- getOption("cl.cores", detectCores()/2)) 
clusterEvalQ(cl, { 
    library(openNLP) 
    library(NLP) 
    PTA <- Maxent_POS_Tag_Annotator() 
}) 
m <- parLapply(cl, text.var, tagPOS) 
print(m) 
stopCluster(cl) 

如果clusterEvalQ失败,因为Maxent_POS_Tag_Annotator是找不到的,你可能会被装上了工人的错版openNLP的。你可以决定你用clusterEvalQ执行sessionInfo混得工人什么包版本:

library(parallel) 
cl <- makeCluster(2) 
clusterEvalQ(cl, {library(openNLP); library(NLP)}) 
clusterEvalQ(cl, sessionInfo()) 

这将返回在每个群集工人执行sessionInfo()的结果。以下是我使用的一些软件包的版本信息,适用于我:

other attached packages: 
[1] NLP_0.1-0  openNLP_0.2-1 

loaded via a namespace (and not attached): 
[1] openNLPdata_1.5.3-1 rJava_0.9-4 
+0

谢谢您的信息。虽然同样的错误仍然存​​在。 –

+0

@TylerRinker如果您没有明确导出Maxent_Word_Token_Annotator,它会有帮助吗? –

+0

Steve再次感谢您的时间。我犯了同样的错误。看到我上面的编辑。你可以尝试openNLP 2.1吗?我无法尝试2.0,因为我的电脑没有互联网。请注意2.1 –