2013-08-20 16 views
16

有人知道有什么函数可以把数字的文字表示转换为实际的数字吗?例如把 'twenty thousand three hundred and five' 转换为 20305。我的数据框(data frame)各行中有用文字写出的数字,希望把它们转换为数值。(标题:在 R 中将文字书写的数字转换为数值)

在 qdap 包中,可以把用数字表示的数替换为英文单词(例如,1001 变成 one thousand one),但不能反过来转换:

library(qdap) 
replace_number("I like 346457 ice cream cones.") 
[1] "I like three hundred forty six thousand four hundred fifty seven ice cream cones." 
+0

@亨克我重写了一下你的问题,使它更清楚你需要将单词转换为数字,而不是反之亦然。 –

+2

我认为最好的办法是把那个提交了“用文字书写数字”的文件的人毙了(开个玩笑)。说正经的,我认为除了编写一个相当详细的解析算法之外别无他法:它需要包含所有数字单词的庞大词库('one'、'two'、……'hundred'、'thousand'、……'googol'),以及某种确定优先顺序的规则。例如,在你的例子中有两个 “hundred”,但根据它们后面所跟的单词,二者的含义并不相同。 –

回答

14

这是一个起点,应该能让你处理到几十万以内的数字。

# Convert an English number phrase (e.g. "forty six thousand four hundred
# fifty seven") into its numeric value.
#
# Arguments:
#   word  A character string holding the number written in words. Only the
#         first element is used. Words may be separated by spaces or hyphens
#         and the connective "and" is ignored.
#
# Returns:
#   A two-element list: the input `word` and the numeric value. If a word is
#   not recognised, a warning is raised and the value is NA.
#
# Besides the standard form, the colloquial forms "four fifty seven" (457) and
# "nineteen eighty four" (1984) are accepted: a number word that is
# immediately followed by a teen/tens word counts as hundreds.
word2num <- function(word) {
  small <- c(
    zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5,
    six = 6, seven = 7, eight = 8, nine = 9,
    ten = 10, eleven = 11, twelve = 12, thirteen = 13, fourteen = 14,
    fifteen = 15, sixteen = 16, seventeen = 17, eighteen = 18, nineteen = 19,
    twenty = 20, thirty = 30, forty = 40, fifty = 50,
    sixty = 60, seventy = 70, eighty = 80, ninety = 90
  )
  doubles <- names(small)[small >= 10]
  scales <- c(thousand = 1e3, million = 1e6, billion = 1e9, trillion = 1e12)

  tokens <- strsplit(tolower(word), "[[:space:]-]+")[[1]]
  tokens <- tokens[!tokens %in% c("", "and")]

  total <- 0    # value of the groups already closed by a scale word
  current <- 0  # value of the group currently being read (below 1000)
  for (i in seq_along(tokens)) {
    tok <- tokens[i]
    if (tok %in% names(small)) {
      value <- small[[tok]]
      # Colloquial hundreds: "four fifty" -> 450, "nineteen eighty" -> 1980.
      if (i < length(tokens) && tokens[i + 1] %in% doubles) {
        value <- value * 100
      }
      current <- current + value
    } else if (identical(tok, "hundred")) {
      # A bare "hundred" means one hundred.
      current <- max(current, 1) * 100
    } else if (tok %in% names(scales)) {
      # A scale word closes the current group: "four hundred thousand".
      total <- total + max(current, 1) * scales[[tok]]
      current <- 0
    } else {
      warning("word2num: unrecognised number word '", tok, "'", call. = FALSE)
      return(list(word, NA_real_))
    }
  }
  list(word, total + current)
}

结果:

> word2num("fifty seven") 
[[1]] 
[1] "fifty seven" 

[[2]] 
[1] 57 

> word2num("four fifty seven") 
[[1]] 
[1] "four fifty seven" 

[[2]] 
[1] 457 

> word2num("six thousand four fifty seven") 
[[1]] 
[1] "six thousand four fifty seven" 

[[2]] 
[1] 6457 

> word2num("forty six thousand four fifty seven") 
[[1]] 
[1] "forty six thousand four fifty seven" 

[[2]] 
[1] 46457 

> word2num("forty six thousand four hundred fifty seven") 
[[1]] 
[1] "forty six thousand four hundred fifty seven" 

[[2]] 
[1] 46457 

> word2num("three forty six thousand four hundred fifty seven") 
[[1]] 
[1] "three forty six thousand four hundred fifty seven" 

[[2]] 
[1] 346457 

可以看出,这对 word2num("four hundred thousand fifty") 不起作用,因为它不知道如何处理连续出现的 “hundred” 和 “thousand”,不过这个算法应该可以进一步修改。如果有人有改进,欢迎直接编辑,或者在自己的答案中基于它继续完善。我只是觉得这是一个有趣的问题(也琢磨了好一阵子)。

编辑:显然Bill Venables有一个叫english的包,可能比上面的代码更好。

+0

我试着查看 english 包中哪里可以做到这一点。它似乎只支持反方向的转换(数字转单词),但也许是我漏看了什么? –

-1

这是我认为是更好的解决方案。

library(stringdist) 
    library(gdata) 
    #Convert numeric words to digits 
# Test whether `string` is (approximately) an English number word.
#
# Arguments:
#   string  Word to test; it is lower-cased before comparison.
#   dist    Largest string distance that still counts as a match.
#   method  Distance method handed to stringdist().
#
# Returns TRUE when at least one entry of the number vocabulary lies within
# `dist` of `string`.
isNumericWord <- function(string, dist = 1, method = "dl") {
  units <- c("zero", "one", "two", "three", "four",
             "five", "six", "seven", "eight", "nine")
  teens <- c("ten", "eleven", "twelve", "thirteen", "fourteen",
             "fifteen", "sixteen", "seventeen", "eighteen", "nineteen")
  tens <- c("twenty", "thirty", "forty", "fifty",
            "sixty", "seventy", "eighty", "ninety")
  magnitudes <- c("hundred", "thousand", "million", "billion", "trillion")
  vocabulary <- c(units, teens, tens, magnitudes)

  candidate <- tolower(string)
  distances <- stringdist(candidate, vocabulary, method = method)
  within_reach <- distances <= dist
  return(any(within_reach))
}
# Split ordinal number words into a cardinal word plus its ending, so that
# "third" becomes "three rd" and "eleventh" becomes "eleven th".
#
# Arguments:
#   string  Text to process. Only the first element is used. Punctuation is
#           replaced by spaces before the text is split into words.
#   dist    Largest string distance that still counts as a match.
#   method  Distance method handed to stringdist().
#
# Returns:
#   The words joined by single spaces, or "" when no words remain.
#
# NOTE(review): matching is approximate (distance <= dist), so with the
# default dist = 1 ordinary words that are close to an ordinal or a number
# word are presumably rewritten as well - confirm this is acceptable for the
# intended input.
numberTypes <- function(string, dist = 1, method = "dl") {
  nums <- c(
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen",
    "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty",
    "ninety", "hundred", "thousand", "million", "billion", "trillion"
  )
  # Irregular ordinals and their replacements, applied in this order.
  ordinals <- c(
    "first", "second", "third", "fourth", "fifth", "sixth", "seventh",
    "eighth", "ninth", "tenth", "twentieth", "thirtieth", "fortieth",
    "fiftieth", "sixtieth", "seventieth", "eightieth", "ninetieth"
  )
  replacements <- c(
    "one st", "two nd", "three rd", "four th", "five th", "six th",
    "seven th", "eight th", "nine th", "ten th", "twenty th", "thirty th",
    "forty th", "fifty th", "sixty th", "seventy th", "eighty th",
    "ninety th"
  )
  # For these ordinals a word ending in "y" is never replaced, so that
  # "sixty" is not mistaken for a misspelt "sixth".
  skip_if_ends_in_y <- ordinals %in% c(
    "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth"
  )

  string <- gsub("[[:punct:]]", " ", string)
  words <- strsplit(string, split = " ")[[1]]
  words <- words[!is.na(words) & words != ""]
  if (length(words) == 0) {
    return("")
  }

  # Handle the irregular ordinals.
  for (k in seq_along(ordinals)) {
    lowered <- tolower(words)
    hit <- stringdist(ordinals[k], lowered, method = method) <= dist
    if (skip_if_ends_in_y[k]) {
      last_char <- substr(lowered, nchar(lowered), nchar(lowered))
      hit <- hit & last_char != "y"
    }
    words[which(hit)] <- replacements[k]
  }

  # Handle other number words that end in "th" (e.g. "eleventh").
  for (i in seq_along(words)) {
    len <- nchar(words[i])
    stem <- substr(words[i], 1, len - 2)
    ending <- substr(words[i], len - 1, len)
    if (ending == "th" && len != 2 &&
        any(stringdist(tolower(stem), nums, method = method) <= dist)) {
      words[i] <- paste(stem, ending, sep = " ")
    }
  }

  # Words split twice (e.g. "four th" -> "four  th") contain repeated spaces;
  # collapse every run of spaces into a single one.
  gsub(" +", " ", paste(words, collapse = " "))
}

#Convert number words to digits 
# Replace the number words in a piece of text by digits, e.g.
# "fifty seven apples" -> "57 apples" and "the twenty first day" ->
# "the 21st day".
#
# Arguments:
#   string  Text to convert. Only the first element is processed.
#   dist    Largest string distance at which a word still counts as a
#           (misspelt) number word.
#   method  Distance method handed to stringdist().
#
# Returns:
#   The converted text, lower-cased, with hyphens and punctuation replaced by
#   spaces. When the text contains no number word (or is NA / empty) the input
#   is returned unchanged.
#
# Consecutive number words are combined into one number ("one thousand two
# hundred" -> 1200). Words that cannot belong to the same number start a new
# one ("one two" -> "1 2", "nineteen eighty four" -> "19 84").
#
# NOTE(review): matching is approximate, so with the default dist = 1 ordinary
# words that are close to a number word (presumably e.g. "to" vs "two") are
# converted as well. Pass dist = 0 for exact matching.
Word2Num <- function(string, dist = 1, method = "dl") {
  original <- string
  if (length(string) == 0 || is.na(string[1])) {
    return(original)
  }

  # Define numbers
  one_digits <- c(zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5,
                  six = 6, seven = 7, eight = 8, nine = 9)
  double_digits <- c(eleven = 11, twelve = 12, thirteen = 13, fourteen = 14,
                     fifteen = 15, sixteen = 16, seventeen = 17,
                     eighteen = 18, nineteen = 19,
                     ten = 10, twenty = 20, thirty = 30, forty = 40,
                     fifty = 50, sixty = 60, seventy = 70, eighty = 80,
                     ninety = 90)
  large_digits <- c(hundred = 100, thousand = 1e3, million = 1e6,
                    billion = 1e9, trillion = 1e12)
  endings <- c("rd", "th", "st", "nd")

  # Value of the first entry of `table` within `dist` of `word`, else NA.
  lookup <- function(word, table) {
    hit <- stringdist(word, names(table), method = method) <= dist
    if (any(hit, na.rm = TRUE)) table[[which(hit)[1]]] else NA_real_
  }
  # Numeric value of a single (possibly misspelt) number word, else NA.
  word_value <- function(word) {
    # A word ending in "y" is never a single digit ("eighty" is not "eight").
    ends_in_y <- substr(word, nchar(word), nchar(word)) == "y"
    value <- if (ends_in_y) NA_real_ else lookup(word, one_digits)
    if (is.na(value)) value <- lookup(word, double_digits)
    if (is.na(value)) value <- lookup(word, large_digits)
    value
  }

  # Split the string into words
  string <- gsub("-", " ", gsub(" & ", " and ", string, fixed = TRUE),
                 fixed = TRUE)
  string <- numberTypes(string, dist = dist, method = method)
  words <- strsplit(tolower(string), " ")[[1]]
  words <- words[words != ""]

  values <- vapply(words, word_value, numeric(1), USE.NAMES = FALSE)
  if (all(is.na(values))) {
    return(original)
  }

  # Walk through the words, copying ordinary words and replacing every run of
  # number words by its digits.
  n <- length(words)
  out <- character(n)
  n_out <- 0
  i <- 1
  while (i <= n) {
    if (is.na(values[i])) {
      n_out <- n_out + 1
      out[n_out] <- words[i]
      i <- i + 1
      next
    }

    total <- 0      # groups already closed by thousand / million / ...
    current <- 0    # group being read (below 1000)
    prev <- NA_real_
    while (i <= n && !is.na(values[i])) {
      v <- values[i]
      if (v < 100) {
        # After a word below 100 only a unit following a tens word can extend
        # the number ("twenty one"); anything else starts a new number.
        extends <- is.na(prev) || prev >= 100 ||
          (prev >= 20 && v >= 1 && v <= 9)
        if (!extends) {
          break
        }
        current <- current + v
      } else if (v == 100) {
        current <- max(current, 1) * 100
      } else {
        total <- total + max(current, 1) * v
        current <- 0
      }
      prev <- v
      i <- i + 1
    }
    # sprintf() avoids scientific notation without touching options(scipen).
    token <- sprintf("%.0f", total + current)

    # Combine the number with its ordinal ending ("3" + "rd" -> "3rd").
    if (i <= n && words[i] %in% endings) {
      token <- paste0(token, words[i])
      i <- i + 1
    }
    n_out <- n_out + 1
    out[n_out] <- token
  }

  paste(out[seq_len(n_out)], collapse = " ")
}