2017-08-11 59 views
1

我想解析来自Web API的XML响应。在R中解析嵌套的XML(带名称空间)

对于如下简单的xml,我可以使用xpathSApply并很容易地获取相关数据。

以下是example.xml:

<?xml version="1.0" encoding="UTF-8"?> 
<CATALOG> 
    <PLANT> 
     <COMMON>Bloodroot</COMMON> 
     <BOTANICAL>Sanguinaria canadensis</BOTANICAL> 
     <ZONE>4</ZONE> 
     <LIGHT>Mostly Shady</LIGHT> 
     <PRICE>$2.44</PRICE> 
     <AVAILABILITY>031599</AVAILABILITY> 
    </PLANT> 
    <PLANT> 
     <COMMON>Columbine</COMMON> 
     <BOTANICAL>Aquilegia canadensis</BOTANICAL> 
     <ZONE>3</ZONE> 
     <LIGHT>Mostly Shady</LIGHT> 
     <PRICE>$9.37</PRICE> 
     <AVAILABILITY>030699</AVAILABILITY> 
    </PLANT> 
</CATALOG> 

>library(XML) 
>doc<-xmlTreeParse("example.xml",useInternal=TRUE) 
>rootNode<-xmlRoot(doc) 
>xpathSApply(rootNode,"//COMMON",xmlValue) 
[1] "Bloodroot" "Columbine" 

> getNodeSet(doc,"//PLANT") 
[[1]] 
<PLANT> 
    <COMMON>Bloodroot</COMMON> 
    <BOTANICAL>Sanguinaria canadensis</BOTANICAL> 
    <ZONE>4</ZONE> 
    <LIGHT>Mostly Shady</LIGHT> 
    <PRICE>$2.44</PRICE> 
    <AVAILABILITY>031599</AVAILABILITY> 
</PLANT> 

[[2]] 
<PLANT> 
    <COMMON>Columbine</COMMON> 
    <BOTANICAL>Aquilegia canadensis</BOTANICAL> 
    <ZONE>3</ZONE> 
    <LIGHT>Mostly Shady</LIGHT> 
    <PRICE>$9.37</PRICE> 
    <AVAILABILITY>030699</AVAILABILITY> 
</PLANT> 

attr(,"class") 
[1] "XMLNodeSet" 

> xmlSApply(getNodeSet(rootNode,"//PRICE"),xmlValue) #provides a list of all PRICE values in the xml 
[1] "$2.44" "$9.37" 

然而,同样的命令对下面这个带有命名空间信息的XML不起作用。有没有办法获取这些节点/标签中的数据?

以下是example1.xml

<?xml version="1.0" encoding="UTF-8"?> 
<s:Envelope xmlns:s="http://schemas.xmlsoap.org/soap/envelope/" xmlns:u="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-utility-1.0.xsd"><s:Body><GetByFilterTradeResponse xmlns="http://entrader.contigoenergy.com/Contigo.Entrader.Service"><GetByFilterTradeResult xmlns:i="http://www.w3.org/2001/XMLSchema-instance"> 
<CATALOG> 
    <CATEGORY> 
     <FAMILY> 
      <PLANT> 
       <COMMON>Bloodroot</COMMON> 
       <BOTANICAL>Sanguinaria canadensis</BOTANICAL> 
       <ZONE>4</ZONE> 
       <DETAILS> 
        <PRICEINBULK>2.3</PRICEINBULK> 
        <MINVOLUME>100</MINVOLUME> 
       </DETAILS> 
       <LIGHT>Mostly Shady</LIGHT> 
       <PRICE>$2.44</PRICE> 
       <AVAILABILITY>031599</AVAILABILITY> 
      </PLANT> 
      <PLANT> 
       <COMMON>Columbine</COMMON> 
       <BOTANICAL>Aquilegia canadensis</BOTANICAL> 
       <ZONE>3</ZONE> 
       <DETAILS> 
        <PRICEINBULK>9.00</PRICEINBULK> 
        <MINVOLUME>100</MINVOLUME> 
       </DETAILS> 
       <LIGHT>Mostly Shady</LIGHT> 
       <PRICE>$9.37</PRICE> 
       <AVAILABILITY>030699</AVAILABILITY> 
      </PLANT> 
     </FAMILY> 
    </CATEGORY> 
</CATALOG> 
</GetByFilterTradeResult></GetByFilterTradeResponse></s:Body></s:Envelope> 

以下命令无法从上面的XML中获取数据:

>doc<-xmlTreeParse("example1.xml",useInternal=TRUE) 
>rootNode<-xmlRoot(doc) 
> xpathSApply(rootNode,"//COMMON",xmlValue) 
list() 

> getNodeSet(doc,"//PLANT") 
list() 
attr(,"class") 
[1] "XMLNodeSet" 

> xmlSApply(getNodeSet(rootNode,"//PRICE"),xmlValue) 
list() 

回答

1

在XPath中使用name()或local-name()来提取节点值:

library(XML) 

appText <- '<?xml version="1.0" encoding="UTF-8"?> 
<s:Envelope xmlns:s="http://schemas.xmlsoap.org/soap/envelope/" xmlns:u="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-utility-1.0.xsd"> 
<s:Body><GetByFilterTradeResponse xmlns="http://entrader.contigoenergy.com/Contigo.Entrader.Service"> 
<GetByFilterTradeResult xmlns:i="http://www.w3.org/2001/XMLSchema-instance"> 
<CATALOG> 
<CATEGORY> 
<FAMILY> 
<PLANT> 
<COMMON>Bloodroot</COMMON> 
<BOTANICAL>Sanguinaria canadensis</BOTANICAL> 
<ZONE>4</ZONE> 
<DETAILS> 
<PRICEINBULK>2.3</PRICEINBULK> 
<MINVOLUME>100</MINVOLUME> 
</DETAILS> 
<LIGHT>Mostly Shady</LIGHT> 
<PRICE>$2.44</PRICE> 
<AVAILABILITY>031599</AVAILABILITY> 
</PLANT> 
<PLANT> 
<COMMON>Columbine</COMMON> 
<BOTANICAL>Aquilegia canadensis</BOTANICAL> 
<ZONE>3</ZONE> 
<DETAILS> 
<PRICEINBULK>9.00</PRICEINBULK> 
<MINVOLUME>100</MINVOLUME> 
</DETAILS> 
<LIGHT>Mostly Shady</LIGHT> 
<PRICE>$9.37</PRICE> 
<AVAILABILITY>030699</AVAILABILITY> 
</PLANT> 
</FAMILY> 
</CATEGORY> 
</CATALOG> 
</GetByFilterTradeResult></GetByFilterTradeResponse></s:Body></s:Envelope>' 
doc <- xmlParse(appText) 
> xpathSApply(doc,"//*[name()='COMMON']", xmlValue) 
[1] "Bloodroot" "Columbine" 

或者明确定义名称空间:

> xpathSApply(doc,"//n:COMMON",xmlValue, namespaces = 
+     c(s = "http://schemas.xmlsoap.org/soap/envelope/", 
+     n = "http://entrader.contigoenergy.com/Contigo.Entrader.Service", 
+     i = "http://www.w3.org/2001/XMLSchema-instance")) 
[1] "Bloodroot" "Columbine" 

或使用xml2包:

library(xml2) 
doc <- read_xml(appText) 
# check namespaces 
> xml_ns(doc) 
d1 <-> http://entrader.contigoenergy.com/Contigo.Entrader.Service 
i <-> http://www.w3.org/2001/XMLSchema-instance 
s <-> http://schemas.xmlsoap.org/soap/envelope/ 
u <-> http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-utility-1.0.xsd 

> xml_text(xml_find_all(doc, "//d1:COMMON")) 
[1] "Bloodroot" "Columbine" 
+0

非常感谢您@jdharrison。 –