2017-05-08 42 views
1
<?xml version="1.0"?> 
<catalog> 
    <book id="bk001" type='fiction'> 
     <author>Gambardella, Matthew</author> 
     <author>Doe, John</author> 
     <title>XML IN-DEPT Developer's Guide</title> 
     <genre>Computer</genre> 
     <price>44.95</price> 
     <snippet> 
      <inlineXML contenttype="application/xhtml+xml" > 
       <html lang="en-US" > 
        <head> 
         <title>XML IN-DEPT Developer's Guide</title> 
        </head> 
        <body> 
         <p>This is an example book for developers want to gain knowledge on <span class="boldcls" type="xml" >XML</span> Marshalling and UnMarshalling. Need to know all about <span class="boldcls" type="tech" >XML parsing and editing</span>, Grab this Book!</p> 
        </body> 
       </html> 
      </inlineXML> 
     </snippet> 
    </book> 
</catalog> 

后得到上面的文字是XML样本,我想评估XPath表达式“/书/片断”,并遍历所有元素,并获取文本。我正在使用这个(https://stackoverflow.com/a/21279523/1297935)修改后的代码(如下面的UPDATE中所述)使用VTD-XML库,但问题是它在遇到span标记后没有得到我的文本。所以输出现在我得到的段落标记是:VTD-XML - 不能跨标签

Level [6] Tag [p] 
      This is an example book for developers want to gain knowledge on 
    Level [7] Tag [span] @class=boldcls 
      XML 
    Level [8] Tag [span] @class=boldcls 
      XML parsing and editing 

哪项是错误的,因为它应该是:

Level [6] Tag [p] 
      This is an example book for developers want to gain knowledge on XML Marshalling and UnMarshalling. Need to know all about XML parsing and editing, Grab this Book! 
    Level [7] Tag [span] @class=boldcls 
      XML 
    Level [8] Tag [span] @class=boldcls 
      XML parsing and editing 

UPDATE: 我已经修改了代码示例了一下:

上面的代码的
package com.vtd.test; 

import java.io.ByteArrayOutputStream; 
import java.io.File; 
import java.io.IOException; 
import java.util.ArrayList; 
import java.util.LinkedHashMap; 
import java.util.List; 
import java.util.Map; 

import javax.xml.parsers.DocumentBuilder; 
import javax.xml.parsers.DocumentBuilderFactory; 
import javax.xml.transform.Transformer; 
import javax.xml.transform.TransformerFactory; 
import javax.xml.transform.dom.DOMSource; 
import javax.xml.transform.stream.StreamResult; 

import org.w3c.dom.Document; 

import com.ximpleware.AutoPilot; 
import com.ximpleware.NavException; 
import com.ximpleware.VTDGen; 
import com.ximpleware.VTDNav; 
import com.ximpleware.XPathEvalException; 
import com.ximpleware.XPathParseException; 

public class VTDXMLReader { 

    // private String xpathExpression; 

    private VTDNav vtdNav; 

    private AutoPilot autoPilot; 

    private boolean includeAttributes; 

    private String attribute; 

    public VTDXMLReader(final Document storyDoc, final boolean includeAttributes, final String xpathExpression) { 
     this.includeAttributes = includeAttributes; 
     // this.xpathExpression = xpathExpression; 
     final VTDGen vtdGen = new VTDGen(); 
     try { 
      ByteArrayOutputStream baos = new ByteArrayOutputStream(); 
      TransformerFactory transformerFactory = TransformerFactory.newInstance(); 
      Transformer transformer = transformerFactory.newTransformer(); 
      DOMSource source = new DOMSource(storyDoc); 
      StreamResult result = new StreamResult(baos); 
      transformer.transform(source, result); 
      byte[] array = baos.toByteArray(); 

      vtdGen.setDoc(array); 
      vtdGen.parse(true); 
     } catch (Exception ex) { 
      ex.printStackTrace(); 
     } 
     vtdNav = vtdGen.getNav(); 
     autoPilot = new AutoPilot(vtdNav); 
     String[] xpathFrags = xpathExpression.split("/"); 
     if (xpathFrags[xpathFrags.length - 1].startsWith("@")) { 
      attribute = xpathFrags[xpathFrags.length - 1].replaceAll("@", ""); 
     } 
     try { 
      autoPilot.selectXPath(xpathExpression); 
     } catch (XPathParseException e) { 
      e.printStackTrace(); 
     } 
    } 

    public List<String> readXML() throws IOException { 
     List<String> values = new ArrayList<String>(); 
     try { 
      while (autoPilot.evalXPath() != -1) { 
       // printTag(vn); 
       if (includeAttributes) { 
        Map<String, String> amap = new LinkedHashMap<String, String>(); 

        loadAttributeMap(vtdNav, amap); 

        for (String aname : amap.keySet()) { 
         String aval = amap.get(aname); 
         values.add(aval); 
         // System.out.print(" @" + aname + "=" + aval); 
        } 
        // System.out.print("\n"); 
       } 
       int val = 0; 
       if (attribute != null && !attribute.isEmpty()) { 
        val = vtdNav.getAttrVal(attribute); 
        if (val != -1) { 
         String id = vtdNav.toNormalizedString(val); 
         values.add(id); 
         // System.out.println("Attribute: " + id); 
        } 
       } 
       val = vtdNav.getText(); 
       if (val != -1) { 
        String author = vtdNav.toNormalizedString(val); 
        values.add(author); 
        // System.out.println("\t" + author); 
       } 
       navigateToChildren(vtdNav, includeAttributes, values); 

      } 
      // autoPilot.resetXPath(); 
     } catch (Exception ex) { 
      ex.printStackTrace(); 
     } 

     return values; 

    } 

    public static void navigateToChildren(final VTDNav vn, final boolean includeAttributes, List<String> values) { 
     try { 
      vn.push(); 
      if (vn.toElement(VTDNav.FIRST_CHILD)) { 
       do { 
        // printTag(vn); 

        if (includeAttributes) { 
         Map<String, String> amap = new LinkedHashMap<String, String>(); 

         loadAttributeMap(vn, amap); 

         for (String aname : amap.keySet()) { 
          String aval = amap.get(aname); 
          values.add(aval); 
          // System.out.print(" @" + aname + "=" + aval); 
         } 
         // System.out.print("\n"); 
        } 

        int val = vn.getText(); 
        if (val != -1) { 
         String author = vn.toNormalizedString(val); 
         values.add(author); 
         // System.out.println("\t" + author); 
        } 
        navigateToChildren(vn, includeAttributes, values); 
       } while (vn.toElement(VTDNav.NEXT_SIBLING)); 
      } 
      vn.toElement(VTDNav.PARENT); 
      vn.pop(); 
     } catch (Exception e) { 
      e.printStackTrace(); 
     } 
    } 

    private static void loadAttributeMap(VTDNav nav, Map<String, String> amap) { 

     nav.push(); 

     try { 
      AutoPilot apAtt = new AutoPilot(nav); 
      apAtt.selectXPath("@*"); 

      int j = -1; 
      while ((j = apAtt.evalXPath()) != -1) { 
       String name = nav.toString(j); 
       String val = nav.toString(j + 1); 

       amap.put(name, val); 
      } 
     } catch (XPathParseException | XPathEvalException | NavException e) { 
      e.printStackTrace(); 
     } 

     nav.pop(); 
    } 

    public static void main(String[] args) { 
     try { 
      DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); 
      DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); 
      Document document = dBuilder.parse(new File("books.xml")); 

      VTDXMLReader vtdxmlReader = new VTDXMLReader(document, false, "/catalog/book/snippet"); 
      List<String> xmlFrags = vtdxmlReader.readXML(); 
      for (String xmlFrag : xmlFrags) { 
       System.out.println(xmlFrag); 
      } 
     } catch (Exception e) { 
      e.printStackTrace(); 
     } 
    } 

} 

输出是:

XML IN-DEPT Developer's Guide 
This is an example book for developers want to gain knowledge on 
XML 
XML parsing and editing 

本应是:

XML IN-DEPT Developer's Guide 
This is an example book for developers want to gain knowledge on 
XML 
Marshalling and UnMarshalling. Need to know all about 
XML parsing and editing 
, Grab this Book! 

任何想法?

我想要做什么: 如果下面是在HTML文档段落标记:

<p>This is an example book for developers want to gain knowledge on <span class="boldcls" type="xml" >XML</span> Marshalling and UnMarshalling. Need to know all about <span class="boldcls" type="tech" >XML parsing and editing</span>, Grab this Book!</p> 

我想写一个阅读器,从左至右包括属性值读取它,像通过线以下行:

==> This is an example book for developers want to gain knowledge on 
==> boldcls xml XML 
==> Marshalling and UnMarshalling. Need to know all about 
==> boldcls tech XML parsing and editing 
==> , Grab this Book! 

目前我做这个用的XMLEventReader,我想用VTD-XML库代码来替换。

+0

您可以显示您正在使用的xpath吗? – SomeDude

+0

我使用这段代码:vp.loadFile(“books.xml”); vp.getElementsByXpath(“/ catalog/book/snippet”); vp.parseAndPrint(); – dev009

+0

我可以看看你的代码吗?你可以发布吗? –

回答

1

我对你的navigateToChildren子程序做了轻微的修改...我调用了VTDNav的getXPathStringVal()来获取所有文本节点......基本上,问题是getText()对数据中心的xml文档工作正常。对于以文档为中心的用例,您应该调用getXPathStringVal()方法直接提取文本节点...此方法在较新版本的vtd-xml中可用。这是你想要的?

public static void navigateToChildren(final VTDNav vn, final boolean includeAttributes, List<String> values) { 
     try { 
      vn.push(); 
      if (vn.toElement(VTDNav.FIRST_CHILD)) { 
       do { 
        //printTag(vn); 

        if (includeAttributes) { 
         Map<String, String> amap = new LinkedHashMap<String, String>(); 

         loadAttributeMap(vn, amap); 

         for (String aname : amap.keySet()) { 
          String aval = amap.get(aname); 
          values.add(aval); 
          System.out.print(" ==>@" + aname + "=" + aval); 
         } 
         // System.out.print("\n"); 
        } 

        int val = vn.getText(); 

        if (val != -1) { 
         String author = vn.getXPathStringVal(); 
         values.add(author); 
         System.out.println("==>\t" + author); 
        } 
        navigateToChildren(vn, includeAttributes, values); 
       } while (vn.toElement(VTDNav.NEXT_SIBLING)); 
      } 
      vn.toElement(VTDNav.PARENT); 
      vn.pop(); 
     } catch (Exception e) { 
      e.printStackTrace(); 
     } 
    } 

第二个编辑:我写了一个小的应用程序,完成了所有的下级文字和attR VAL级联。基本上这直接访问使用索引值的基础VTD缓冲区...和扫描通VTD中依次记录。如果令牌类型是属性值或字符数据,则应用程序会将其附加到字符串缓冲区中...

import com.ximpleware.*; 

public class collectTokens { 
    public static void main(String[] s) throws VTDException{ 
     VTDGen vg = new VTDGen(); 
     if (!vg.parseFile("d:\\xml\\books.xml", true)){ 
      return; 
     } 
     VTDNav vn = vg.getNav(); 
     AutoPilot ap = new AutoPilot(vn); 
     ap.selectXPath("/catalog/book/snippet/inlineXML/html/body/p"); 
     int i=ap.evalXPath(); 
     // i points to the p element node 
     if (i!=-1){ 
      int j = vn.getCurrentIndex();// get the token index of p 
      int d = vn.getTokenDepth(j); 
      int count = vn.getTokenCount(); 
      int index=j+1; 
      // collect the text of all text and attr vals sequentially 
      StringBuilder sb = new StringBuilder(50); 
      while((index<count)){ 
       if (vn.getTokenDepth(index)==d 
         && vn.getTokenDepth(index)== VTDNav.TOKEN_STARTING_TAG) 
        break; 
       if (vn.getTokenType(index)== VTDNav.TOKEN_CHARACTER_DATA 
         || vn.getTokenType(index)==VTDNav.TOKEN_ATTR_VAL){ 
          sb.append(vn.toString(index)+" "); 
         } 
       index++; 
      } 
      System.out.println(sb); 
     } 
    } 
} 
+0

谢谢@ vtd-xml-author。这似乎与我所需要的非常接近。这就是我需要: '==> \t XML IN-DEPT开发者指南 ==> \t开发商想获得关于 ==> \t XML ==> \t打包和解包所知,这是一个例子书。需要知道全部关于 ==> \t XML解析和编辑 ==> \t,抓斗本书!' 可以这样做吗? – dev009

+0

再次感谢@ vtd-xml-author。我在我的问题中添加了“我想做的事”。我不知道这是否可以使用VTD-XML来完成。 – dev009