2012-10-19 44 views
1

我有下面这段HTML,请问如何使用JSoup获取从"<html"开始到"<a id="summary"></a>"为止的这部分文本?我尝试了下面的正则表达式选择器,但它返回的是空字符串。(标题:使用JSoup按标签范围搜索HTML)

// The asker's attempt, reported above to return an empty result.
// NOTE(review): jsoup's :matches(regex) pseudo-selector appears to test an element's
// text rather than its raw markup, so a pattern written against "<html ... a>" would
// not match anything — confirm against the jsoup selector documentation.
doc.select("*:matches(^[<html]*[a>]$)")

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> 
<html xmlns="http://www.w3.org/1999/xhtml"> 
<head> 
<title>TestNG: Unit Test</title> 
</head> 
<body> 
<a id="summary"></a> 

<table cellspacing=0 cellpadding=0 class="param" style="float: left; width:630px;"> 
<tr><th>Test</th><th class="numi">Methods<br/>Passed</th><th class="numi">Scenarios<br/>Passed</th><th class="numi"># skipped</th><th class="numi"># failed</th><th class="numi">Total<br/>Time</th><th class="numi">Included<br/>Groups</th><th class="numi">Excluded<br/>Groups</th></tr> 
</table> 


</body></html> 

回答

0

这有点棘手,因为你必须对DOM进行深度优先遍历。NodeTraversor可以帮你做到这一点。

下面是一个例子:

package stuff; 

import org.jsoup.Jsoup; 
import org.jsoup.nodes.Document; 
import org.jsoup.nodes.Element; 
import org.jsoup.nodes.Node; 
import org.jsoup.select.Elements; 
import org.jsoup.select.NodeTraversor; 
import org.jsoup.select.NodeVisitor; 

/**
 * Demonstrates extracting the part of an HTML page that precedes the
 * {@code <a id="summary">} marker element, using jsoup's {@link NodeTraversor}.
 */
public class A {

    /** Sample page from the doctype up to and including the opening body tag. */
    private static final String PAGE_START =
            "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" +
            "<html xmlns=\"http://www.w3.org/1999/xhtml\">" +
            "<head>" +
            "<title>TestNG: Unit Test</title>" +
            "</head>" +
            "<body>";

    /** Sample page from the table that follows the marker to the end of the document. */
    private static final String PAGE_END =
            "<table cellspacing=0 cellpadding=0 class=\"param\" style=\"float: left; width:630px;\">" +
            "<tr><th>Test</th><th class=\"numi\">Methods<br/>Passed</th><th class=\"numi\">Scenarios<br/>Passed</th><th class=\"numi\"># skipped</th><th class=\"numi\"># failed</th><th class=\"numi\">Total<br/>Time</th><th class=\"numi\">Included<br/>Groups</th><th class=\"numi\">Excluded<br/>Groups</th></tr>" +
            "</table>" +
            "</body>" +
            "</html>";

    /** The marker element at which extraction stops. */
    private static final String SUMMARY_ANCHOR = "<a id=\"summary\"></a>";

    /**
     * Runs {@link #parse(String)} on two sample pages and prints the results:
     * one where the marker is the first element of the body, and one where
     * another anchor precedes it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String html = PAGE_START + SUMMARY_ANCHOR + PAGE_END;
        System.out.println(parse(html));

        String html2 = PAGE_START + "<a id=\"something_else\"></a>" + SUMMARY_ANCHOR + PAGE_END;
        System.out.println(parse(html2));
    }

    /**
     * Returns the markup that precedes the first {@code <a id="summary">} element.
     *
     * <p>The result is the inner HTML of the document head, followed by a literal
     * {@code <body>} tag, followed by the outer HTML of every body element that
     * comes before the marker in document order. Elements that contain the marker
     * are descended into rather than written out, so their own tags are not part
     * of the result. Each preceding element is written exactly once, together with
     * its descendants. Text that sits directly next to the marker's ancestors (not
     * inside a written element) is not included. If the page has no marker, every
     * element child of the body is written.
     *
     * @param html the HTML source to parse
     * @return the extracted markup
     */
    public static String parse(String html) {
        Document document = Jsoup.parse(html);
        // Locate the marker once so the visitor can compare nodes by identity.
        Element marker = document.body().select("a#summary").first();

        StringBuilder buffer = new StringBuilder();
        buffer.append(document.head().html());
        buffer.append("<body>");

        // NOTE(review): this is the instance-based traversal API the file was written
        // against; newer jsoup releases may offer a static NodeTraversor.traverse(...)
        // instead — confirm against the jsoup version in use.
        NodeTraversor traversor = new NodeTraversor(new MarkerPrefixVisitor(marker, buffer));
        traversor.traverse(document.body());
        return buffer.toString();
    }

    /**
     * Visitor that writes out whole elements encountered before the marker.
     *
     * <p>Work is done in {@code head} (pre-order). Once an element has been written
     * in full, its subtree is skipped, so nested elements are never emitted twice.
     */
    private static final class MarkerPrefixVisitor implements NodeVisitor {

        /** Sentinel meaning "not currently inside an already written element". */
        private static final int NONE = -1;

        private final Element marker;
        private final StringBuilder out;
        private boolean finished = false;
        private int writtenDepth = NONE;

        /**
         * @param marker the element at which to stop, or {@code null} if the page has none
         * @param out    receives the outer HTML of the elements preceding the marker
         */
        MarkerPrefixVisitor(Element marker, StringBuilder out) {
            this.marker = marker;
            this.out = out;
        }

        @Override
        public void head(Node node, int depth) {
            if (finished || writtenDepth != NONE || !(node instanceof Element)) {
                return;
            }
            Element element = (Element) node;
            if (element == marker) {
                finished = true;
                return;
            }
            // The traversal root (depth 0) is always descended into, because the
            // caller has already written its opening tag; elements on the path to
            // the marker are descended into so that only what precedes it is written.
            if (depth == 0 || isAncestorOfMarker(element)) {
                return;
            }
            out.append(element.outerHtml());
            writtenDepth = depth;
        }

        @Override
        public void tail(Node node, int depth) {
            // Leaving the element that was written in full: resume looking at siblings.
            if (depth == writtenDepth) {
                writtenDepth = NONE;
            }
        }

        /** Returns whether {@code element} lies on the parent chain of the marker. */
        private boolean isAncestorOfMarker(Element element) {
            if (marker == null) {
                return false;
            }
            for (Element ancestor = marker.parent(); ancestor != null; ancestor = ancestor.parent()) {
                if (ancestor == element) {
                    return true;
                }
            }
            return false;
        }
    }
}

这段代码不算很漂亮(尤其是手动执行 buffer.append("<body>"); 这一步)……但写起来很快 :)

相关的例子请参见这个回答(this answer)。

0

我不太确定,但你可以试试这个:依次输出选中的元素,当遇到作为边界的"<a>"标签时,循环就在该边界处停止输出。

// Print the selected elements in order and stop right after the first <a>
// element, which acts as the boundary marker (the marker itself is printed).
// "what u want" is a placeholder for the real selector.
Elements doc=select("what u want");
for (Element e : doc) {
    System.out.println(e);
    // Plain equality is enough here; no regular expression is needed.
    if ("a".equals(e.tagName())) {
        break; // boundary reached: nothing after it should be printed
    }
}