2012-11-14 144 views
0

我使用这个类进行多线程cUrl文件下载;为什么多cURL下载文件的大小为0位?

在本地机器上下载文件内容,文件为空;

我在想什么?

<?php 

/** 
* Crawler class file. 
* 
* CRAWLER 
* 
*/ 
class CrawlerCommand extends CConsoleCommand { 

    private $instance_crawler_id; 
    private $instance_crawler_url_limit = 10; 
    public $multi_exec_curl_files = array(); 
    public $collection = array(); 
    public static $userAgents = array(
     'FireFox3' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; pl; rv:1.9) Gecko/2008052906 Firefox/3.0', 
     'GoogleBot' => 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', 
     'IE7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)', 
     'Netscape' => 'Mozilla/4.8 [en] (Windows NT 6.0; U)', 
     'Opera' => 'Opera/9.25 (Windows NT 6.0; U; en)' 
    ); 
    public static $options = array(
     CURLOPT_USERAGENT => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)', 
     CURLOPT_AUTOREFERER => true, 
     CURLOPT_FOLLOWLOCATION => true, 
     CURLOPT_RETURNTRANSFER => true, 
     CURLOPT_FRESH_CONNECT => true, 
     CURLOPT_COOKIEJAR => "cookies.txt", 
     CURLOPT_COOKIEFILE => "cookies.txt", 
     CURLOPT_SSL_VERIFYPEER => false, 
     CURLOPT_CONNECTTIMEOUT => 5, 
     CURLOPT_TIMEOUT => 10, 
      //CURLOPT_COOKIESESSION => false, 
    ); 

    /** 
    * Executes the command. 
    * @param array command line parameters for this command. 
    */ 
    public function run($args) { 
     // $args gives an array of the command-line arguments for this command 

     for ($i = 0; $i < 50; $i++) { 
      echo PHP_EOL; 
     } 

     //check if we need to create a new crawler instance; if all html sources from the active url's have been downloaded, there is no point to recrawl 
     $day = date('d', time()); 
     $sql = "select * from `url` where `instance_crawler_day`!='$day' and `status`='1' order by `id` asc limit $this->instance_crawler_url_limit;"; 
     $cmd = Yii::app()->db->createCommand($sql); 
     $rows = $cmd->queryAll(); 

     $actual_files = count($rows); 
     //are there any files left to download for today ? 
     if ($actual_files > 0) { 

      //creating a new unique crawler instance 
      $model_instance_crawler = new InstanceCrawler(); 
      $model_instance_crawler->day = $day; 
      $model_instance_crawler->month = date('m', time()); 
      $model_instance_crawler->year = date('Y', time()); 
      if ($model_instance_crawler->save()) { 

       //locking x url's for the crawler to download time at a time 
       $sql = "update `url` set `instance_crawler_id`='$model_instance_crawler->id', `instance_crawler_day`='$model_instance_crawler->day' where `instance_crawler_day`!='$model_instance_crawler->day' and `status`='1' limit $this->instance_crawler_url_limit;"; 
       $cmd = Yii::app()->db->createCommand($sql); 
       $cmd->query(); 

       $robots_txt = new RobotsTXT(); 

       $robots_txt->load_robots_from_db(); 

       $time = strtotime('-60 days'); 

       $sql = "SELECT u.* 
FROM url AS u 
JOIN product p ON p.`url_id` = u.id 
JOIN product_follower pf ON pf.`product_id` = p.`id` AND pf.`created`>:time 
JOIN `user` us ON us.id = pf.`user_id` 
WHERE us.`status` = 1 
GROUP BY u.id 
order by `u`.`website_id` asc"; 

       //selecting x rows that belong to the new crawler instance 
       $sql = "select * from `url` where `instance_crawler_id`='$model_instance_crawler->id' and `instance_crawler_day`='$day' and `status`='1' order by `id` asc;"; 
       $cmd = Yii::app()->db->createCommand($sql); 
       $rows = $cmd->queryAll(); 

       if (count($rows) > 0) { 

        foreach ($rows as $row) { 
         $this->collection[$row['website_id']]['items'][] = $row; 
        } 

        foreach (array_keys($this->collection) as $key) { 

         $this->collection[$key]['urls'] = 0; 

         $collection2[] = $this->collection[$key]; 
        } 

        $this->collection = $collection2; 

        $collection2 = null; 

        $this->processCollection(); 

        //on dew, comment 
        $this->multiCurl($this->multi_exec_curl_files); 
        //CurlTool::downloadFile($url, $fileName, $fields = null, $verbose = false); 
       } 
      } 
     } else { 
      echo 'There are no files left to download today. Come back tomorow.' . PHP_EOL; 
     } 

     echo PHP_EOL . 'DONE' . PHP_EOL; 
    } 

    /** 
    * Provides the command description. 
    * This method may be overridden to return the actual command description. 
    * @return string the command description. Defaults to 'Usage: php entry-script.php command-name'. 
    */ 
    public function getHelp() { 
     return 'Usage: how to use this command'; 
    } 

    private function checkCounters() { 
     $status = false; 
     foreach ($this->collection as $key => $value) { 
      if ($value['urls'] < count($value['items'])) 
       $status = true; 
     } 
     return $status; 
    } 

    public function processCollection() { 
     //print'<pre>'; 
     //print_r($this->collection); 
     $w = 0; 
     while ($this->checkCounters()) { 
      foreach ($this->collection as $key => $value) 
       if ($value['urls'] < count($value['items'])) { 
        echo 'downloading file: ' . ($value['items'][$value['urls']]['id']) . '.html' . PHP_EOL; 
        //prepare the array for the multi thread cURL downloading process 
        $this->multi_exec_curl_files[$value['items'][$value['urls']]['id']] = array('link' => $value['items'][$value['urls']]['link']); 
        //$this->downloadFile($value['items'][$value['urls']]['url'], CRAWLER_FILES . ($value['items'][$value['urls']]['id']) . '.html'); 
        //echo $value['items'][$value['urls']]['link'].'<br>'; 
        //CurlTool::downloadFile($value['items'][$value['urls']]['link'], ($value['items'][$value['urls']]['id']) . '.html'); 
        $this->collection[$key]['urls']++; 
       } 
      $w++; 
      if ($w > count($this->collection)) 
       $w = 0; 
     } 
    } 

    public function multiCurl($res, $options = "") { 

     if (count($res) <= 0) 
      return False; 

     $handles = array(); 

     if (!$options) // add default options 
      $options = self::$options; 

     // add curl options to each handle 
     foreach ($res as $k => $row) { 
      $ch{$k} = curl_init(); 
      $options[CURLOPT_URL] = $row['link']; 
      curl_setopt_array($ch{$k}, $options); 
      $handles[$k] = $ch{$k}; 
     } 

     $mh = curl_multi_init(); 

     foreach ($handles as $k => $handle) { 
      curl_multi_add_handle($mh, $handle); 
     } 

     $running_handles = null; 
     //execute the handles 
     do { 
      $status_cme = curl_multi_exec($mh, $running_handles); 
     } while ($cme == CURLM_CALL_MULTI_PERFORM); 

     while ($running_handles && $status_cme == CURLM_OK) { 
      if (curl_multi_select($mh) != -1) { 
       do { 
        $status_cme = curl_multi_exec($mh, $running_handles); 
       } while ($status == CURLM_CALL_MULTI_PERFORM); 
      } 
     } 

     foreach ($res as $k => $row) { 
      $res[$k]['error'] = curl_error($handles[$k]); 
      print_r($res[$k]['error']); 
      if (!empty($res[$k]['error'])) { 
       $res[$k]['data'] = ''; 
      } else { 
       //$res[$k]['data'] = curl_multi_getcontent($handles[$k]); // get results 
       file_put_contents(CRAWLER_FILES . $k . '.html', curl_multi_getcontent($handles[$k])); 
      } 

      // close current handler 
      curl_multi_remove_handle($mh, $handles[$k]); 
     } 
     curl_multi_close($mh); 
     return $res; // return response 
    } 

} 
+0

你检查HTTP状态代码的含义是什么? – PaulSkinner

+0

它的存在,但不能下载内容 –

+0

是的,但它是否得到了403禁止返回,或其他错误状态代码? – PaulSkinner

回答

0

答案是使用http://www.somacon.com/p537.php

发现我错了一些变量名,并通过比较两个代码,我发现他们

<?php 
// LICENSE: PUBLIC DOMAIN 
// The author disclaims copyright to this source code. 
// AUTHOR: Shailesh N. Humbad 
// SOURCE: http://www.somacon.com/p539.php 
// DATE: 6/4/2008 

// index.php 
// Run the parallel get and print the total time 
$s = microtime(true); 
// Define the URLs 
$urls = array(
    "http://localhost/r.php?echo=request1", 
    "http://localhost/r.php?echo=request2", 
    "http://localhost/r.php?echo=request3" 
); 
$pg = new ParallelGet($urls); 
print "<br />total time: ".round(microtime(true) - $s, 4)." seconds"; 

// Class to run parallel GET requests and return the transfer 
class ParallelGet 
{ 
    function __construct($urls) 
    { 
    // Create get requests for each URL 
    $mh = curl_multi_init(); 
    foreach($urls as $i => $url) 
    { 
     $ch[$i] = curl_init($url); 
     curl_setopt($ch[$i], CURLOPT_RETURNTRANSFER, 1); 
     curl_multi_add_handle($mh, $ch[$i]); 
    } 

    // Start performing the request 
    do { 
     $execReturnValue = curl_multi_exec($mh, $runningHandles); 
    } while ($execReturnValue == CURLM_CALL_MULTI_PERFORM); 
    // Loop and continue processing the request 
    while ($runningHandles && $execReturnValue == CURLM_OK) { 
     // Wait forever for network 
     $numberReady = curl_multi_select($mh); 
     if ($numberReady != -1) { 
     // Pull in any new data, or at least handle timeouts 
     do { 
      $execReturnValue = curl_multi_exec($mh, $runningHandles); 
     } while ($execReturnValue == CURLM_CALL_MULTI_PERFORM); 
     } 
    } 

    // Check for any errors 
    if ($execReturnValue != CURLM_OK) { 
     trigger_error("Curl multi read error $execReturnValue\n", E_USER_WARNING); 
    } 

    // Extract the content 
    foreach($urls as $i => $url) 
    { 
     // Check for errors 
     $curlError = curl_error($ch[$i]); 
     if($curlError == "") { 
     $res[$i] = curl_multi_getcontent($ch[$i]); 
     } else { 
     print "Curl error on handle $i: $curlError\n"; 
     } 
     // Remove and close the handle 
     curl_multi_remove_handle($mh, $ch[$i]); 
     curl_close($ch[$i]); 
    } 
    // Clean up the curl_multi handle 
    curl_multi_close($mh); 

    // Print the response data 
    print_r($res); 
    } 

} 
?> 

<?php 
// r.php 
// This script runs a variable amount of time 
// and generates a variable amount of data 

// Output a random amount of blank space 
$s = microtime(true); 
$m = rand(500,1000); 
for($i = 0; $i < $m; $i++) { 
    print "   \n"; 
    usleep(10); 
} 

// Print time taken and the value of the "echo" parameter 
print isset($_REQUEST["echo"]) ? $_REQUEST["echo"] : ""; 
print " in "; 
print round(microtime(true) - $s, 4)." seconds"; 
exit(); 
?> 
相关问题