2012-04-25 78 views
0

标题:在 Scala 中使用解析器组合子(parser combinators)解析句子

如何高效地(并且不让代码过于杂乱)解析类似下面这样的句子?其中关键字/分隔符用 [] 标出。

Manager, Delhi [for] The Company Pvt Ltd. [from] Jan, 2009 [to] Jan, 2012

需要使用解析器组合子从这段文本中提取出人名(职位)、公司名称和日期范围。(输出示例见本文底部)

下面是我针对上述需求编写的代码:



    /**
     * Result of parsing one employment sentence.
     *
     * @param company   company name as extracted from the text
     * @param position  position/title text that preceded the company
     * @param dateRange the parsed month/year values of the period
     */
    case class CompanyWithMonthDateRange(
      company: String,
      position: String,
      dateRange: List[MonthYear]
    )

    /**
     * A month token paired with a numeric year.
     *
     * @param month month text exactly as it was matched
     * @param year  year as an integer
     */
    case class MonthYear(
      month: String,
      year: Int
    )

    /**
     * Parses sentences shaped like
     * `<position words> (in|for|with|at) <company words> <company ext>+ <month> <year> (to|till|until|upto) <month> <year>`
     * into a [[CompanyWithMonthDateRange]].
     *
     * Matching is case-sensitive and whitespace is significant (`skipWhitespace` is off), so
     * every rule consumes the separators it expects explicitly.
     *
     * Parsers that only act as guards are sequenced with `~>` / `<~` and lists are rendered with
     * `mkString`, so results are plain strings rather than the `toString` of `~` / `List` values
     * (which is what produced output such as `(()~Company)~List()`).
     */
    object CompanyParser1 extends RegexParsers {
      override type Elem = Char
      override def skipWhitespace = false

      /**
       * Reserved words that may not start a position or company word.
       *
       * The trailing `\b` makes this a whole-word match, so ordinary words that merely start
       * with a keyword (e.g. `india`, `format`) are not rejected.
       */
      def keywords: Parser[String] =
        ("""(for|in|with|at|from|pvt|ltd|company|co|limited|inc|corporation|""" +
          """jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|to|till|until|upto)\b""").r

      val date: Parser[String] = """\d\d\d\d""".r | """\d\d""".r
      val integer: Parser[Int] = """(0|[1-9]\d*)""".r ^^ { _.toInt }
      val comma: Parser[String] = """\,""".r
      val quote: Parser[String] = """[\'\"]+""".r
      val underscore: Parser[String] = """\_""".r
      val dot: Parser[String] = """\.""".r
      // Whitespace is matched explicitly and its text is discarded.
      val space: Parser[String] = """\s+""".r ^^ { _ => "" }
      val colon: Parser[String] = """:""".r
      val ampersand: Parser[String] = """(\&|and)""".r
      val hyphen: Parser[String] = """\-""".r
      val brackets: Parser[String] = """[\(\)]+""".r
      val newline: Parser[String] = """[\n\r]""".r
      // `oct` was missing from the original alternation.
      val months: Parser[String] = """(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)""".r
      val toTillUntil: Parser[String] = """(to|till|until|upto)""".r
      val asWord: Parser[String] = """(as)""".r
      val fromWord: Parser[String] = """from""".r
      val forWithAt: Parser[String] = """(in|for|with|at)""".r
      // `\b` stops the earlier alternative `co` from swallowing the start of `corporation`.
      val companyExt: Parser[String] = """(pvt|ltd|company|co|limited|inc|corporation)\b""".r
      // `~>` keeps only the word; the `not(keywords)` guard contributes no value.
      val alphabets: Parser[String] = not(keywords) ~> """[a-zA-Z]+""".r
      val name: Parser[String] = not(keywords) ~> """[a-zA-Z][a-zA-Z\,\-\'\&\(\)]+\s+""".r

      /** One company extension, plus any dots and whitespace after it (so `pvt ltd` can repeat). */
      def possibleCompanyExts: Parser[String] = companyExt <~ rep(dot) <~ rep(space)

      /** One or more words, each with its attached punctuation, joined by single spaces. */
      def alphabetsExt: Parser[String] =
        rep1((alphabets ~ rep(quote | ampersand | hyphen | brackets | underscore | comma)) <~ rep(space)) ^^ {
          words => words.map { case word ~ punctuation => word + punctuation.mkString }.mkString(" ")
        }

      /** Company words followed by at least one extension; the extensions are consumed but dropped. */
      def companyNameExt: Parser[String] = alphabetsExt <~ rep(space) <~ rep1(possibleCompanyExts)

      def companyName: Parser[List[String]] = rep(alphabetsExt)

      def entityName: Parser[String] = rep1(alphabetsExt) ^^ { parts => parts.map(_.trim).mkString(" ") }

      /** A 4- or 2-digit number as an Int, skipping trailing punctuation and whitespace. */
      def dateWithEndingChars: Parser[Int] =
        (date <~ rep(comma | quote | dot | newline) <~ rep(space)) ^^ { _.toInt }

      /** A month token, skipping trailing punctuation and whitespace. */
      def monthWithEndingChars: Parser[String] =
        months <~ rep(comma | quote | dot | newline) <~ rep(space)

      def monthWithDate: Parser[MonthYear] =
        (monthWithEndingChars ~ dateWithEndingChars) ^^ { case month ~ year => MonthYear(month, year) }

      /** `<month year> (to|till|until|upto) <month year>` as a two-element list: start, end. */
      def monthDateRange: Parser[List[MonthYear]] =
        (monthWithDate ~ rep(space) ~ toTillUntil ~ rep(space) ~ monthWithDate) ^^ {
          case start ~ _ ~ _ ~ _ ~ end => List(start, end)
        }

      def companyWithMonthDateRange: Parser[CompanyWithMonthDateRange] =
        (companyNameExt ~ rep(space) ~ monthDateRange) ^^ {
          case company ~ _ ~ range =>
            CompanyWithMonthDateRange(company = company, position = "", dateRange = range)
        }

      /** Top-level rule: position words, a linking preposition, then company and date range. */
      def positionWithCompanyWithMonthDateRange: Parser[CompanyWithMonthDateRange] =
        (rep1(name) ~ rep(space) ~ forWithAt ~ rep(space) ~ companyWithMonthDateRange) ^^ {
          // Each `name` match keeps its own trailing comma/whitespace, so concatenating the
          // pieces restores the position text as written; only the outer whitespace is trimmed.
          case titles ~ _ ~ _ ~ _ ~ parsed => parsed.copy(position = titles.mkString.trim)
        }

      /** Parses `input` and prints either the parsed value or the parser's failure. */
      def apply(input: String): Unit =
        parseAll(positionWithCompanyWithMonthDateRange, input) match {
          case Success(result, _) => println(result)
          case noSuccess          => println(noSuccess)
        }
    }

上面这段代码的输出类似于:



    CompanyWithMonthDateRange(List(((()~Company)~List()), ((()~fd)~List()), ((()~India)~List('))),(()~Manager,),(()~Delhi),List(MonthYear(mar,2010), MonthYear(jul,2012))) 

此外,怎样才能去掉上面输出中那些多余的“~”?

感谢, 爬完

回答

0

我并不打算针对你的实际问题给出一个完整的解决方案;下面的代码只是把句子解析成你提供的数据结构。我不确定它是否对你有帮助,仅供参考。

在你的 CompanyWithMonthDateRange 中,我没有看到用来存放所提取人名的字段,所以我把人名丢弃了;要把它加进去应该很简单。

/**
 * Parses statements of the form
 * `<position>, <name> [for] <company> [from] <month>, <year> [to] <month>, <year>.`
 * where each statement ends with a period at the end of a line.
 *
 * Each statement becomes `Some(CompanyWithMonthDateRange(...))`, or `None` when one of the
 * three sections is missing or a date is not written as `<month>, <integer year>`.
 */
object CompParser extends RegexParsers {
  val For = "[for]"
  val From = "[from]"
  val To = "[to]"
  val Keyword = For | From | To
  // Free text up to (not including) the next `[` or a period that is followed by a line break.
  val Def = """(?m)(?<=^|\]).*?(?=\[|(\.\s*[\n\r]+))""".r
  // Statement terminator: a literal period. (An unescaped `.` would accept any character.)
  val End = """\.""".r

  /** Splits `text` on commas; yields the two parts only when there are exactly two. */
  private def splitPair(text: String): (String, String) =
    text.split(",") match {
      case Array(first, second) => (first, second)
      case _                    => ("", "")
    }

  /** Builds a MonthYear from a (month, year) pair; None when the year is not an integer. */
  private def toMonthYear(pair: (String, String)): Option[MonthYear] =
    try Some(MonthYear(pair._1.trim, pair._2.trim.toInt))
    catch { case _: NumberFormatException => None }

  /** One `[keyword] text` section, tagged with the map key it is stored under. */
  val Construct = opt(Def) ~ Keyword ~ Def ^^ {
    // For `[for]`, the text before the keyword holds "<position>, <name>"; only the position is kept.
    case before ~ `For` ~ after =>
      ("pos&com", (splitPair(before.getOrElse(""))._1, after.toString))
    case _ ~ `From` ~ after =>
      ("from", splitPair(after))
    case _ ~ `To` ~ after =>
      ("to", splitPair(after))
  }

  /** All sections of one statement, combined into a record when all three are present and valid. */
  val Statement = rep(Construct) ^^ (_.toMap) ^^ { sections =>
    for {
      posAndCompany <- sections.get("pos&com")
      from <- sections.get("from").flatMap(toMonthYear)
      to <- sections.get("to").flatMap(toMonthYear)
    } yield CompanyWithMonthDateRange(posAndCompany._2.trim, posAndCompany._1.trim, List(from, to))
  }

  val Statements = rep(Statement <~ End)

  /** Parses `in` and prints either the list of parsed statements or the parser's failure. */
  def apply(in: String): Unit =
    parseAll(Statements, in) match {
      case Success(result, _) => println(result)
      // Print the failure as well, rather than returning it where callers ignore it.
      case noSuccess          => println(noSuccess)
    }
}

解析器会在换行处结束一条语句。下面是对该解析器的测试:

/** Manual smoke test: feeds two sample inputs to CompParser, one after the other. */
object TestP extends App {
    // A single statement.
    val inStr1 = """ 
    Manager, Delhi [for] The Company Pvt Ltd. [from] Jan, 2009 [to] Jan, 2012. 
    """
    // Three statements, one per line.
    val inStr2 = """ 
    Manager, Delhi [for] The Company Pvt Ltd. [from] Jan, 2009 [to] Jan, 2012. 
    Employee, Kate [for] The Company Pvt Ltd. [from] Feb, 2010 [to] Jun, 2012. 
    HR, Jane  [for] The Company Pvt Ltd. [from] May, 2010 [to] July, 2012. 
    """
    // Run the samples in declaration order.
    List(inStr1, inStr2).foreach(sample => CompParser(sample))
}

输出为: inStr1:

List(Some(CompanyWithMonthDateRange(The Company Pvt Ltd. ,Manager,List(MonthYear(Jan,2009),MonthYear(Jan,2012)))))

inStr2:

List(Some(CompanyWithMonthDateRange(The Company Pvt Ltd. ,Manager,List(MonthYear(Jan,2009),MonthYear(Jan,2012)))), Some(CompanyWithMonthDateRange(The Company Pvt Ltd. ,Employee,List(MonthYear(Feb,2010),MonthYear(Jun,2012)))), Some(CompanyWithMonthDateRange(The Company Pvt Ltd. ,HR,List(MonthYear(May,2010),MonthYear(July,2012)))))