2014-03-05 59 views
-1

我想从网站获取 URL 和 HTML 元素。我已经能够从网站获取 URL 和 HTML,但是当一个 URL 对应的页面包含多个元素(例如多个 input 元素或多个 textarea 元素)时,我只能获得最后一个元素。代码如下。问题:如何从网站获取 HtmlElements?

GetURLsAndElemens.java

/**
 * Loads the crawler settings, collects the links found on the page named by
 * the {@code MAIN_URL} property and passes them to
 * {@code GettingHTMLElements.getHTMLElements}.
 *
 * @param args command-line arguments (unused)
 * @throws FileNotFoundException if the properties file cannot be opened
 * @throws IOException if reading the properties file fails, or if
 *         {@code getHTMLElements} reports an I/O failure
 * @throws ParseException kept from the original signature; nothing visible
 *         in this body throws it directly
 */
public static void main(String[] args) throws FileNotFoundException,
       IOException, ParseException {

      Properties properties = new Properties();
      // The original handed an anonymous stream to load() and never closed
      // it; close it explicitly once the properties have been read.
      FileInputStream propertiesStream = new FileInputStream(
          "src//io//servicely//ci//plugin//SeleniumResources.properties");
      try {
       properties.load(propertiesStream);
      } finally {
       propertiesStream.close();
      }

      Map<String, String> urls = gettingUrls(properties
        .getProperty("MAIN_URL"));
      GettingHTMLElements.getHTMLElements(urls);
     }

     /**
      * Fetches {@code mainURL}, records its form elements via
      * {@code GettingHTMLElements.getInputElements}, and returns the links
      * found on that page.
      *
      * <p>Every {@code href} is resolved to an absolute URL against the
      * document's base URI. Only {@code http://} and {@code https://} links
      * are kept, because those are the only ones the crawler can fetch later.
      *
      * @param mainURL absolute URL (including protocol) of the page to scan
      * @return map of absolute link URL to the link's text; empty if the page
      *         could not be fetched
      */
     public static Map<String, String> gettingUrls(String mainURL) {
      Map<String, String> urlsList = new HashMap<String, String>();
      try {
       System.out.println("Main URL " + mainURL);

       // Jsoup needs the protocol (http/https) in the URL.
       Document doc = Jsoup.connect(mainURL).get();
       GettingHTMLElements.getInputElements(doc, mainURL);

       for (Element link : doc.select("a[href]")) {
        // absUrl() resolves relative hrefs against the page's base URI.
        // The original tested href.contains("http") and otherwise
        // concatenated mainURL + href, which mis-classified links such
        // as "/http-guide" and produced malformed URLs like
        // "http://host/page.htmlabout".
        String absoluteUrl = link.absUrl("href");
        if (absoluteUrl.startsWith("http://")
          || absoluteUrl.startsWith("https://")) {
         urlsList.put(absoluteUrl, link.text());
        }
       }

      } catch (IOException e) {
       // Best effort: report the failure and return what was collected.
       e.printStackTrace();
      }
      return urlsList;
     }

GettingHtmlElements.java

// Page URL -> the HtmlElements bean that getInputElements stored for that URL.
static Map<String, HtmlElements> urlList = new HashMap<String, HtmlElements>(); 

    /**
     * Entry point used by the crawler; simply forwards to
     * {@link #getElements(Map)}.
     *
     * @param urls map whose keys are the page URLs to process
     * @throws IOException declared for callers; propagated from
     *         {@code getElements}
     */
    public static void getHTMLElements(Map<String, String> urls)
            throws IOException {
        getElements(urls);
    }

    /**
     * Downloads every URL that is a key of {@code urls}, records its form
     * elements through {@link #getInputElements}, and then prints each
     * stored URL together with its input name.
     *
     * <p>A failure on one URL is printed and does not stop the loop.
     *
     * @param urls map whose keys are the page URLs to fetch
     * @throws IOException declared in the signature; failures inside the
     *         loop are caught and printed rather than propagated
     */
    public static void getElements(Map<String, String> urls) throws IOException {
        for (String pageUrl : urls.keySet()) {
            try {
                System.out.println(pageUrl);
                getInputElements(Jsoup.connect(pageUrl).get(), pageUrl);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        // Dump what has been collected so far.
        for (Map.Entry<String, HtmlElements> collected : urlList.entrySet()) {
            System.out.println("url is " + collected.getKey());
            System.out.println("input name " + collected.getValue().getInput_name());
        }
    }

    /**
     * Copies attributes of the {@code input}, {@code textarea} and
     * {@code form} elements of {@code doc} into a new {@code HtmlElements}
     * bean and stores that bean in {@code urlList} under {@code entry1}.
     *
     * @param doc    parsed page to inspect
     * @param entry1 URL of the page; used as the key in {@code urlList}
     * @return the bean that was just stored for {@code entry1}
     */
    public static HtmlElements getInputElements(Document doc, String entry1) {
        HtmlElements htmlElements = new HtmlElements();

        // NOTE(review): every loop below writes into the same bean, so if the
        // setters are plain single-value setters only the LAST element of each
        // kind is kept. Keeping all of them needs a list-valued bean; confirm
        // against the HtmlElements class.
        for (Element input : doc.getElementsByTag("input")) {
            htmlElements.setInput_name(input.attr("name"));
            htmlElements.setInput_type(input.attr("type"));
            htmlElements.setInput_class(input.attr("class"));
        }
        for (Element textArea : doc.getElementsByTag("textarea")) {
            htmlElements.setTextarea_id(textArea.attr("id"));
            htmlElements.setTextarea_name(textArea.attr("name"));
        }
        for (Element form : doc.getElementsByTag("form")) {
            htmlElements.setForm_method(form.attr("method"));
            htmlElements.setForm_action(form.attr("action"));
        }

        // Map.put returns the PREVIOUS mapping (null on first insert), so the
        // original "return urlList.put(...)" never returned the new bean.
        urlList.put(entry1, htmlElements);
        return htmlElements;
    }

我想把这些元素作为一个 bean 来保存。对于每个 URL,我都能得到该 URL 及其 HTML 元素,但是当 URL 对应的页面包含多个元素时,我只能得到最后一个元素。

回答

0

您使用了 HtmlElements 类,据我所知它不是 Jsoup 的一部分。我不知道它的内部工作原理,但我猜它是某种 HTML 节点或某种列表。

但是,你似乎使用这个类是这样的:

// A single bean instance is created for the whole page.
HtmlElements htmlElements = new HtmlElements(); 
// NOTE(review): presumably a plain setter, so each call replaces the previously stored name; confirm against HtmlElements.
htmlElements.setInput_name(key); 

这表明 htmlElements 变量中只存储了一个 HTML 元素。这可以解释为什么您只能得到最后一个元素——您一直在覆盖同一个实例。

这一点我并不十分确定,因为我不了解 HtmlElements 类。也许下面这样的写法可行,这里假设存在一个与 HtmlElements 配套的 HtmlElement 类,并且 HtmlElements 的实例有一个 add 方法:

// Sketch only: assumes HtmlElements acts as a container offering add(HtmlElement); both are hypothetical here.
HtmlElements htmlElements = new HtmlElements(); 
... 
for (Element inputElement : inputElements2) { 
    // A fresh HtmlElement per <input>, so earlier elements are not overwritten.
    HtmlElement e = new HtmlElement(); 
    htmlElements.add(e); 
    String key = inputElement.attr("name"); 
    e.setInput_name(key); 
}