2013-05-07 188 views
0

我需要弄清楚如何抓取网站,以及如何从需要登录认证的网站下载文件。(标题:自动登录并从网站抓取文件)

脚本需要

  1. 使用用户名/密码登录该网站,
  2. 逐页浏览,直到进入下载页面
  3. 在表单中设置某些字段,然后点击下载按钮
  4. 保存下载的文件

我一直在研究 Jsoup(因为 Java 是我的首选语言),但也可以尝试 Scrapy 等工具。不过我需要了解这类任务通常是怎么完成的,以及是否有其他技术可以实现这一点。我可以使用 Selenium 之类的工具来实现,但我不希望用浏览器作为 UA,因为这会带来巨大的额外开销。我已经取得了一些进展,但整个 cookie 管理变得非常混乱。

感谢, 维韦克

回答

1

如果像你描述的那样需要与网页进行大量交互,那么就绕不开使用真正的浏览器——至少以我的经验是这样。不过,Selenium WebDriver 与 PhantomJS 配合得很好,所以开销不会太大。

正如下面的评论所指出的,您也可以使用类似 mechanize 的工具,但是当 JavaScript 修改页面上的 DOM 时,这类方案往往就无能为力了。(请参阅 http://wwwsearch.sourceforge.net/mechanize/faq.html#script)

+0

并非如此,任何类似 mechanize 的库都可以做到这一点。 – pguardiario 2013-05-08 00:45:58

+0

嗯,没错。我调整了我的答案。 – luksch 2013-05-08 07:43:02

+0

也不是完全没有用处:你可能需要弄清楚一些 ajax 调用,但以我的经验来说,这比 Selenium 带来的各种头痛要好。 – pguardiario 2013-05-08 08:39:17

0

我建议您使用Fiddler2并像平常一样浏览网站。

完成之后,你应该能够用最少的麻烦和代码,轻松地重现所需的页面请求,以及 JavaScript 可能执行的任何操作。

我通常用下面的代码一次下载多个网页,并保存 cookie,以便用于需要登录的网站等场景:

/**
 * Fetches $href with the shared cURL handle and returns the transfer result.
 *
 * This is an excerpt of HttpHelper::Download: it relies on $this->ch and on the
 * WEBBOT_NAME, CURL_TIMEOUT and COOKIE_FILE constants being defined.
 *
 * @param string $href URL to fetch; it is also sent as the Referer value.
 *
 * @return array 'FILE' (response body, or false when the transfer failed),
 *               'STATUS' (curl_getinfo() data), 'ERRORS' (curl_error() text)
 *               and 'DOM' (a DOMXpath over the parsed body; empty on failure).
 */
function Download($href)
{
    curl_setopt($this->ch, CURLOPT_COOKIEJAR, COOKIE_FILE);   // Cookie management.
    curl_setopt($this->ch, CURLOPT_COOKIEFILE, COOKIE_FILE);
    curl_setopt($this->ch, CURLOPT_TIMEOUT, CURL_TIMEOUT);    // Timeout
    curl_setopt($this->ch, CURLOPT_USERAGENT, WEBBOT_NAME);   // Webbot name
    curl_setopt($this->ch, CURLOPT_VERBOSE, FALSE);           // Minimize logs
    // NOTE(review): peer certificate verification is switched off here.
    curl_setopt($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);
    curl_setopt($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);     // Follow redirects
    curl_setopt($this->ch, CURLOPT_MAXREDIRS, 4);             // Limit redirections to four
    curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, TRUE);     // Return in string
    curl_setopt($this->ch, CURLOPT_URL, $href);               // Target site
    curl_setopt($this->ch, CURLOPT_REFERER, $href);           // Referer value

    $return_array = array();
    $return_array['FILE'] = curl_exec($this->ch);
    $return_array['STATUS'] = curl_getinfo($this->ch);
    $return_array['ERRORS'] = curl_error($this->ch);

    $dom_document = new DOMDocument();
    // curl_exec() yields false on failure and the body may be empty; only
    // parse when there is markup, because loadHTML() rejects an empty source.
    if (is_string($return_array['FILE']) && $return_array['FILE'] !== '') {
        // Collect parser warnings internally instead of silencing them with @.
        $previous = libxml_use_internal_errors(true);
        $dom_document->loadHTML($return_array['FILE']);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
    }
    $return_array['DOM'] = new DOMXpath($dom_document);

    return $return_array;
}

这是我的 HttpHelper 类。它易于使用,而且处理的只是 HTML:

<?php 
/**
 * Small cURL/DOM helper for scripted browsing: fetch pages, submit forms and
 * post-process the returned HTML.
 *
 * Download() reuses one cURL handle ($ch) that carries the request headers set
 * in the constructor. http() opens and closes its own handle per call, so those
 * headers are not sent with http() requests. Both use COOKIE_FILE as the cookie
 * jar and cookie source.
 *
 * Result arrays use the key 'ERRORS' in Download() and 'ERROR' in http(); both
 * spellings are kept so existing callers keep working.
 */
class HttpHelper {

    /** @var mixed cURL handle created by curl_init() and used by Download(). */
    public $ch;

    /**
     * Creates the shared cURL handle, defines the global configuration
     * constants (only if they are not defined yet) and sets the default
     * request headers on the shared handle.
     */
    public function __construct()
    {
        $this->ch = curl_init();

        // define() must not run twice for the same name, so every constant is
        // guarded; this makes it safe to create more than one HttpHelper.
        $constants = array(
            'WEBBOT_NAME'  => "Test Webbot",
            // Length of time cURL will wait for a response (seconds)
            'CURL_TIMEOUT' => 25,
            // Location of your cookie file. (Must be fully resolved local address)
            'COOKIE_FILE'  => "cookie.txt",
            // Method constants
            'HEAD'         => "HEAD",
            'GET'          => "GET",
            'POST'         => "POST",
            // Header inclusion flags
            'EXCL_HEAD'    => FALSE,
            'INCL_HEAD'    => TRUE,
        );
        foreach ($constants as $name => $value) {
            if (!defined($name)) {
                define($name, $value);
            }
        }

        $header = array(
            "Accept: text/xml,application/xml,application/xhtml+xml,"
                . "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
            "Cache-Control: max-age=0",
            "Connection: keep-alive",
            "Keep-Alive: 300",
            "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7",
            "Accept-Language: en-us,en;q=0.5",
            "Pragma: ", // browsers keep this blank.
        );

        curl_setopt($this->ch, CURLOPT_HTTPHEADER, $header); // Set Header Information
    }

    /**
     * Fetches a page with the shared handle and collects the HTML, status,
     * error text and a DOMXpath over the parsed body.
     *
     * @param string $href URL to fetch; it is also sent as the Referer value.
     *
     * @return array 'FILE' (body, or false when the transfer failed), 'STATUS'
     *               (curl_getinfo() data), 'ERRORS' (curl_error() text) and
     *               'DOM' (DOMXpath; over an empty document when nothing could
     *               be parsed).
     */
    public function Download($href)
    {
        $this->applyCommonOptions($this->ch, $href, $href);

        $return_array = array();
        $return_array['FILE'] = curl_exec($this->ch);
        $return_array['STATUS'] = curl_getinfo($this->ch);
        $return_array['ERRORS'] = curl_error($this->ch);

        // A failed transfer yields false; fall back to an empty document so
        // callers always receive a usable DOMXpath.
        $dom_document = $this->loadDocument($return_array['FILE']);
        if ($dom_document === null) {
            $dom_document = new DOMDocument();
        }
        $return_array['DOM'] = new DOMXpath($dom_document);

        return $return_array;
    }

    /**
     * POSTs form data and returns the response without the HTTP headers.
     *
     * @param string       $target     URL to post to.
     * @param string       $ref        Referer value.
     * @param array|string $data_array Form fields, or a ready-made query string.
     *
     * @return array See http().
     */
    public function http_post_form($target, $ref, $data_array)
    {
        return $this->http($target, $ref, "POST", $data_array, EXCL_HEAD);
    }

    /**
     * POSTs form data and returns the response including the HTTP headers.
     *
     * @param string       $target     URL to post to.
     * @param string       $ref        Referer value.
     * @param array|string $data_array Form fields, or a ready-made query string.
     *
     * @return array See http().
     */
    public function http_post_withheader($target, $ref, $data_array)
    {
        // Must be dispatched on $this: there is no global http() function.
        return $this->http($target, $ref, "POST", $data_array, INCL_HEAD);
    }

    /**
     * Performs one HEAD, GET or POST request on a fresh cURL handle.
     *
     * @param string            $target     URL to request.
     * @param string            $ref        Referer value.
     * @param string            $method     One of HEAD, GET or POST.
     * @param array|string|null $data_array Fields as key => value pairs, or an
     *                                      already encoded query string.
     * @param bool              $incl_head  Whether the response headers are
     *                                      included in 'FILE' (GET/POST only).
     *
     * @return array 'FILE' (response, or false on failure), 'STATUS'
     *               (curl_getinfo() data) and 'ERROR' (curl_error() text).
     */
    public function http($target, $ref, $method, $data_array, $incl_head)
    {
        # Initialize PHP/CURL handle
        $ch = curl_init();

        # Process data, if presented
        if (is_array($data_array)) {
            # Convert data array into a query string (ie animal=dog&sport=baseball)
            $pairs = array(); // initialised so an empty array yields ''
            foreach ($data_array as $key => $value) {
                $value = (string) $value;
                if (strlen(trim($value)) > 0) {
                    $pairs[] = $key . "=" . urlencode($value);
                } else {
                    // Blank values are sent as a bare key, without "=".
                    $pairs[] = $key;
                }
            }
            $query_string = join('&', $pairs);
        } else {
            $query_string = $data_array;
        }

        if ($method == HEAD) {
            # HEAD method configuration
            curl_setopt($ch, CURLOPT_HEADER, TRUE); // Include the http head
            curl_setopt($ch, CURLOPT_NOBODY, TRUE); // Do not return the body
        } else {
            # GET method configuration
            if ($method == GET) {
                // Skip the "?" when there is nothing to append.
                if (isset($query_string) && $query_string !== '') {
                    $target = $target . "?" . $query_string;
                }
                curl_setopt($ch, CURLOPT_HTTPGET, TRUE);
                curl_setopt($ch, CURLOPT_POST, FALSE);
            }
            # POST method configuration
            if ($method == POST) {
                if (isset($query_string)) {
                    curl_setopt($ch, CURLOPT_POSTFIELDS, $query_string);
                }
                curl_setopt($ch, CURLOPT_POST, TRUE);
                curl_setopt($ch, CURLOPT_HTTPGET, FALSE);
            }
            curl_setopt($ch, CURLOPT_HEADER, $incl_head); // Include head as needed
            curl_setopt($ch, CURLOPT_NOBODY, FALSE);      // Return body
        }

        $this->applyCommonOptions($ch, $target, $ref);

        # Create return array
        $return_array = array();
        $return_array['FILE'] = curl_exec($ch);
        $return_array['STATUS'] = curl_getinfo($ch);
        $return_array['ERROR'] = curl_error($ch);

        # Close PHP/CURL handle
        curl_close($ch);

        # Return results
        return $return_array;
    }

    /**
     * Serialises the children of a DOM node to an HTML string.
     *
     * @param DOMNode|null $element Node whose children are rendered.
     *
     * @return string Concatenated, individually trimmed HTML of each child;
     *                '' when $element is not a node or has no children.
     */
    public function InnerHtml($element)
    {
        $innerHTML = "";
        if (!($element instanceof DOMNode) || !$element->hasChildNodes()) {
            return $innerHTML;
        }

        foreach ($element->childNodes as $child) {
            // Each child is copied into its own document so saveHTML() renders
            // only that subtree.
            $tmp_dom = new DOMDocument();
            $tmp_dom->appendChild($tmp_dom->importNode($child, true));
            $innerHTML .= trim($tmp_dom->saveHTML());
        }

        return $innerHTML;
    }

    /**
     * Splits $data on the delimiter $split (thin wrapper around explode()).
     *
     * @param string $data  Text to split.
     * @param string $split Delimiter.
     *
     * @return array The pieces.
     */
    public function Split($data, $split)
    {
        return explode($split, $data);
    }

    /**
     * Prefixes the src of every <img> that does not already start with $url.
     *
     * @param string $html Markup to rewrite.
     * @param string $url  Prefix to prepend.
     *
     * @return string The re-serialised document; '' when $html is empty.
     */
    public function correctImgUrls($html, $url)
    {
        return $this->prefixAttribute($html, 'img', 'src', $url);
    }

    /**
     * Prefixes the href of every <a> that does not already start with $url.
     *
     * @param string $html Markup to rewrite.
     * @param string $url  Prefix to prepend.
     *
     * @return string The re-serialised document; '' when $html is empty.
     */
    public function correctUrls($html, $url)
    {
        // The rewritten link goes back into "href" (it used to be written to a
        // stray attribute named "a", leaving the real link untouched).
        return $this->prefixAttribute($html, 'a', 'href', $url);
    }

    /**
     * Replaces the href of every <a> element with "#".
     *
     * @param string $html Markup to rewrite.
     *
     * @return string The re-serialised document; '' when $html is empty.
     */
    public function removeHref($html)
    {
        $DOM = $this->loadDocument($html);
        if ($DOM === null) {
            return '';
        }

        foreach ($DOM->getElementsByTagName('a') as $anchor) {
            $anchor->setAttribute('href', "#");
        }

        return $DOM->saveHTML();
    }

    /**
     * Runs an XPath expression against a DOMXpath (e.g. the 'DOM' entry
     * returned by Download()).
     *
     * @param DOMXpath $dom   XPath evaluator.
     * @param string   $xPath Expression to evaluate.
     *
     * @return mixed Whatever $dom->query() returns.
     */
    public function QuerySelector($dom, $xPath)
    {
        return $dom->query($xPath);
    }

    /**
     * Applies the options shared by every request to the given handle.
     */
    private function applyCommonOptions($ch, $url, $referer)
    {
        curl_setopt_array($ch, array(
            CURLOPT_COOKIEJAR      => COOKIE_FILE,  // Cookie management.
            CURLOPT_COOKIEFILE     => COOKIE_FILE,
            CURLOPT_TIMEOUT        => CURL_TIMEOUT, // Timeout
            CURLOPT_USERAGENT      => WEBBOT_NAME,  // Webbot name
            CURLOPT_URL            => $url,         // Target site
            CURLOPT_REFERER        => $referer,     // Referer value
            CURLOPT_VERBOSE        => FALSE,        // Minimize logs
            // NOTE(review): peer certificate verification is switched off.
            CURLOPT_SSL_VERIFYPEER => FALSE,
            CURLOPT_FOLLOWLOCATION => TRUE,         // Follow redirects
            CURLOPT_MAXREDIRS      => 4,            // Limit redirections to four
            CURLOPT_RETURNTRANSFER => TRUE,         // Return in string
        ));
    }

    /**
     * Parses $html into a DOMDocument, or returns null when there is nothing
     * to parse (null, false or '').
     */
    private function loadDocument($html)
    {
        if ($html === null || $html === false || $html === '') {
            return null;
        }

        // Real-world markup is rarely valid; collect parser warnings internally
        // instead of emitting them, then restore the previous setting.
        $previous = libxml_use_internal_errors(true);
        $document = new DOMDocument();
        $document->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        return $document;
    }

    /**
     * Prepends $url to $attribute of every <$tag> whose value does not already
     * start with $url, and returns the re-serialised document.
     *
     * NOTE(review): absolute URLs pointing at other hosts are prefixed as well;
     * this matches the previous behaviour and is left for the caller to decide.
     */
    private function prefixAttribute($html, $tag, $attribute, $url)
    {
        $DOM = $this->loadDocument($html);
        if ($DOM === null) {
            return '';
        }

        foreach ($DOM->getElementsByTagName($tag) as $node) {
            $current = $node->getAttribute($attribute);
            if (strpos($current, $url) !== 0) {
                $node->setAttribute($attribute, $url . $current);
            }
        }

        return $DOM->saveHTML();
    }

    // No destructor: the shared handle is never closed explicitly and is
    // released together with the object.
}
?> 

模拟登录,然后做你需要做的事情。下面是我用来登录自己的 oDesk 帐户并抓取招聘信息、再通过电子邮件发给自己的例子 :P

include("Business/Http/HttpHelper.php");

// Log in by POSTing the credentials to the login form. The login URL is passed
// twice because it is used both as the target and as the Referer value.
$bot = new HttpHelper;
// Optional: fetch the login page first.
//$download = $bot ->Download("https://www.odesk.com/login");
$data = array(
    'username' => "myusername",
    'password' => "myPassword",
);
$bot->http_post_form("https://www.odesk.com/login", "https://www.odesk.com/login", $data);

你欠我吧!