2016-06-25 38 views

I have been working on a scraper for a month, trying to scrape links (hrefs) starting from URLs fetched out of MySQL, and to make my recursive PHP scraper faster.

I have already applied as many techniques to it as I could. I have tried:

  1. Exec shell calls to curl for parallel processing.
  2. Exec shell calls to PHP scripts for parallel processing.
  3. Parallel threads (pthreads), which did not work correctly (not sure why).

I call a function recursively to grab the links from a website, and then scrape those links further (filtering out invalid links: #, javascript:void, and so on). I get roughly 500,000 to 600,000 records in about 30 minutes, but most of them are duplicates; if I query the distinct values from these records, I only get about 50,000.

Here is my code:

function multiRequest($urls) { 

global $link; 



$filter_links = array();
$namecheap_filter_internal_array = array(); // collects internal links from every completed handle
$rolling_window = sizeof($urls);

$master = curl_multi_init(); 


// add additional curl options here 
$std_options = array(CURLOPT_RETURNTRANSFER => true, 
    CURLOPT_FOLLOWLOCATION => true, 
    CURLOPT_CONNECTTIMEOUT => 35, 
    CURLOPT_HEADER => false, 
    CURLOPT_TIMEOUT => 30); 
$options = $std_options; 

// start the first batch of requests 
for ($i = 0; $i < $rolling_window; $i++) { 
    $ch = curl_init(); 
    $options[CURLOPT_URL] = $urls[$i]; 
    $options[CURLOPT_PRIVATE] = $urls[$i]; 
    curl_setopt_array($ch, $options); 
    curl_multi_add_handle($master, $ch); 
} 

do { 
    while (($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM); 
    if ($execrun != CURLM_OK) { 
     break; 
    } 
    // a request was just completed -- find out which one 
    while ($done = curl_multi_info_read($master)) { 

     // the URL we stored in CURLOPT_PRIVATE when the handle was created
     $available_curl = curl_getinfo($done['handle'], CURLINFO_PRIVATE);

     $html = curl_multi_getcontent($done['handle']);

     // reset per page, so total_scraped_links below counts this page only
     $filter_links = array();

     $domDoc = new DOMDocument('1.0');
     @$domDoc->loadHTML($html);

     $anchors = $domDoc->getElementsByTagName('a'); 
     foreach ($anchors as $element) { 
      $href = $element->getAttribute('href'); 
      $href = rtrim($href, "/"); 
      $href = trim($href); 

      // skip fragments, empty/self links, javascript:, index.php, mailto: and binary/image extensions
      $skip_ext = array('.jpg', '.jpeg', '.png', '.gif', '.tiff', '.tif', '.pdf');
      $skip = (strpos($href, '#') !== false) || $href == '' || $href == $available_curl
        || (strpos($href, 'javascript:') !== false) || (strpos($href, 'index.php') !== false)
        || (strpos($href, 'mailto:') !== false);
      foreach ($skip_ext as $ext) {
       $skip = $skip || (strpos($href, $ext) !== false);
      }
      if ($skip) {
       continue; 
      } 
      // naive relative-to-absolute resolution: prefix scheme://host[:port]
      if (0 !== strpos($href, 'http')) {
       $path = '/' . ltrim($href, '/');


       $parts = parse_url($available_curl); 

       $href = $parts['scheme'] . '://'; 

       $href .= $parts['host']; 
       if (isset($parts['port'])) { 
        $href .= ':' . $parts['port']; 
       } 
       $href .=$path; 
      } 


      $href = rtrim($href, "/");
      $filter_links[] = $href;

     } 

     $filter_links = array_unique($filter_links);
     $scraped_domain = remove_http($available_curl);
     $scraped_domain_key = key_domain_generator($scraped_domain);
     mysqli_query($link, "UPDATE domains SET is_scraped=1, total_scraped_links = '" . count($filter_links) . "' WHERE domain_u_key = '" . mysqli_real_escape_string($link, $scraped_domain_key) . "'") or die(mysqli_error($link));
     // merge rather than overwrite, so links from every completed page survive the loop
     $namecheap_filter_internal_array = array_merge($namecheap_filter_internal_array, extrnl_intrnl_filter($filter_links, $available_curl));

     curl_multi_remove_handle($master, $done['handle']);
     curl_close($done['handle']); // free the handle once processed
    } 
} while ($running); 

curl_multi_close($master); 
if (count($namecheap_filter_internal_array) > 0) {
    // recurse into the internal links collected from this batch
    multiRequest(array_values(array_unique($namecheap_filter_internal_array)));
} 

}
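One structural note on the function above: it adds every URL to the multi handle up front, so the "rolling window" is really just the batch size, and a huge link list means a huge number of simultaneous transfers. A true rolling window keeps a fixed number of transfers in flight and tops the window up as transfers complete. A sketch of that pattern, where makeHandle is a hypothetical helper wrapping the curl_init()/curl_setopt_array() calls above:

function rollingMultiRequest(array $urls, $window = 10) {
    $master = curl_multi_init();
    $queue = $urls;

    // seed the window with the first $window handles
    for ($i = 0; $i < $window && count($queue) > 0; $i++) {
        curl_multi_add_handle($master, makeHandle(array_shift($queue)));
    }

    do {
        curl_multi_exec($master, $running);
        curl_multi_select($master); // block until there is activity, instead of spinning

        while ($done = curl_multi_info_read($master)) {
            // ... process curl_multi_getcontent($done['handle']) as above ...
            curl_multi_remove_handle($master, $done['handle']);
            curl_close($done['handle']);

            if (count($queue) > 0) { // top the window back up
                curl_multi_add_handle($master, makeHandle(array_shift($queue)));
                $running = 1; // force another pass of the outer loop
            }
        }
    } while ($running);

    curl_multi_close($master);
}
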

function extrnl_intrnl_filter($href_array, $domain_link) { 

global $link; 
$is_external = 0; 
$workers = []; 
$x_count = 0;
$namecheap_filter_internal_array = array(); // fixes "undefined variable" when no internal links are found
foreach ($href_array as $href) { 
    $href_url = parse_url($href);
    if (empty($href_url['host'])) {
     continue; // relative URL that was not normalized; nothing to classify
    }
    $href_domain = $href_url['host'];
    $key_href = giveHost($href_domain);
    if (isexternal($href_domain, $domain_link) == 'External') { 
     $key_href_esc = mysqli_real_escape_string($link, $key_href);
     $domains_Query = "select count(*) as domain_found from domains where base_url='$key_href_esc'";
     $domains_run_Query = mysqli_query($link, $domains_Query) or die(mysqli_error($link)); 
     $domaininfo = mysqli_fetch_assoc($domains_run_Query); 
     if ($domaininfo['domain_found'] == 0) { // only process domains not seen before
      if (preg_match('/^[-a-z0-9]+\.[a-z]{2,6}$/', strtolower($key_href))) { 
       $is_external = 1; 
       if (domain_insert_check($href, $is_external)) { 
        echo 'progress';
        $workers[$x_count] = new WorkerThreads($href); 
        $workers[$x_count]->start(); 
        $x_count++; 


        //exec("nohup curl --url http://37.59.1.141/tool2/index2.php?data=" . $domain_list_scrap . " > /dev/null 2> /dev/null &"); 
        //exec("nohup php /var/www/test/tool2/index2.php " . $href . " > /dev/null 2> /dev/null &"); 

        //exec("nohup php /var/www/test/tool2/index2.php?data=" . $href . " > /dev/null 2> /dev/null &"); 
        //exec("nohup curl --url http://37.59.1.141/tool2/index2.php?data=" . $href . " > /dev/null 2> /dev/null &"); 
       } 
      } 
     } 
    } else { 
     $is_external = 0; 
     if (domain_insert_check($href, $is_external)) { 
      $workers[$x_count] = new WorkerThreads($href); 
      $workers[$x_count]->start(); 
      $x_count++; 
      $namecheap_filter_internal_array[] = $href; 

     } 
    } 
} 
// wait for all worker threads to finish
for ($forvar = 0; $forvar < $x_count; $forvar++) {
    $workers[$forvar]->join(); 
} 

return array_unique($namecheap_filter_internal_array); 

}
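For context on the pthreads attempt: WorkerThreads is used above but its definition is not shown. With the pthreads extension it would be a Thread subclass along these lines (a guess at the intended shape, not the actual class). Two common reasons a threaded run works once and then stops are running pthreads on a non-thread-safe (non-ZTS) PHP build, and sharing a single mysqli connection across threads through a global, as the code above does:

// Hypothetical reconstruction of the WorkerThreads class, assuming the pthreads extension.
class WorkerThreads extends Thread {
    private $url;

    public function __construct($url) {
        $this->url = $url;
    }

    public function run() {
        // Each thread needs its own DB connection; mysqli handles must not
        // be shared between threads via a global.
        $link = mysqli_connect('localhost', 'user', 'pass', 'scraperdb'); // placeholder credentials
        // ... fetch $this->url and store results using this connection ...
        mysqli_close($link);
    }
}
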

function domain_insert_check($href, $is_external) { 
global $link; 
$href_url = parse_url($href); 
$href_ex_https = remove_http($href); 
$href_domain = $href_url['host']; 
$href_scheme = $href_url['scheme']; 
$key_href_i = key_domain_generator($href_ex_https); 

$query = "insert into domains set domain_name = '" . addslashes($href_ex_https) . "'," 
     . "doamin_schema = '" . $href_scheme . "'," 
     . "base_url = '" . strtolower(giveHost($href_domain)) . "'," 
     . "domain_u_key = '" . $key_href_i . "'," 
     . "is_expired = '0'," 
     . "is_scraped = '0'," 
     . "is_external = '" . $is_external . "'," 
     . "ExtBackLinks = '0'," 
     . "RefDomains='0'," 
     . "ACRank = '0'," 
     . "RefIPs = '0'," 
     . "RefSubNets = '0'," 
     . "RefDomainsEDU = '0'," 
     . "RefDomainsGOV = '0'," 
     . "Title = 'title'," 
     . "total_scraped_links = '0'," 
     . "CitationFlow = '0'," 
     . "TrustFlow = '0'," 
     . "TopicalTrustFlow_Topic_0 = 'TopicalTrustFlow_Topic_0'," 
     . "TopicalTrustFlow_Value_0 = '0'," 
     . "TopicalTrustFlow_Topic_1 = 'TopicalTrustFlow_Topic_1'," 
     . "TopicalTrustFlow_Value_1 = '0'," 
     . "TopicalTrustFlow_Topic_2 = 'TopicalTrustFlow_Topic_2'," 
     . "TopicalTrustFlow_Value_2 = '0'," 
     . "date_created = '" . date('Y-m-d H:i:s') . "'," 
     . "user_id = 1"; 

$result = mysqli_query($link, $query); 
if (!$result) { 
    mysqli_query($link, "insert into domainerror SET error = '" . $key_href_i . "' , domains= '" . $href_ex_https . "', type='fail'"); 

    return false; 
} else { 
    return true; 
} 

}
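Since domain_insert_check builds its SQL by string concatenation, a prepared statement would sidestep the escaping problem entirely. A sketch reduced to the key columns (the remaining columns could keep table defaults or be bound the same way; doamin_schema is spelled as in the schema above):

$base_url = strtolower(giveHost($href_domain));
$created = date('Y-m-d H:i:s');

$stmt = mysqli_prepare($link,
    "INSERT INTO domains (domain_name, doamin_schema, base_url, domain_u_key, is_external, date_created, user_id)
     VALUES (?, ?, ?, ?, ?, ?, 1)");
mysqli_stmt_bind_param($stmt, 'ssssis', $href_ex_https, $href_scheme, $base_url, $key_href_i, $is_external, $created);
mysqli_stmt_execute($stmt);
mysqli_stmt_close($stmt);
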

I really have no idea how I can optimize this further so that it grabs more records; I have already optimized it as much as I can. If I use PHP calls instead of curl, it chokes MySQL's max connections. If I use pthreads, it runs the first time and then stops.


A friendly bot would not try to scrape a site as fast as possible. Try limiting it to a few pages per minute; you will still end up with the data you need. Take robots.txt into account, and show in your user agent who and what you are. Don't try to be a bad bot.
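In terms of the code in the question, that advice could look like this (the user-agent string and delay are placeholders):

// identify the bot in every request ...
$std_options[CURLOPT_USERAGENT] = 'MyScraper/1.0 (+http://example.com/bot-info)'; // placeholder UA

// ... and throttle between batches instead of recursing immediately
sleep(5); // a few pages per minute, not as fast as the server allows
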

Answer


My first suggestion would be to drop DOMDocument and replace it with a regular expression, which parses faster and with a smaller memory footprint.
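For example, a single preg_match_all over the raw HTML collects the hrefs without building a DOM tree. A sketch: the pattern is deliberately simple and will miss edge cases (unquoted attributes, hrefs inside comments) that a real parser handles:

// extract href values from raw HTML without DOMDocument
if (preg_match_all('/<a\s[^>]*href=["\']([^"\']+)["\']/i', $html, $matches)) {
    foreach ($matches[1] as $href) {
        // ... apply the same filtering as before ...
    }
}
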

Another, smaller improvement would be to replace the linear array search with an O(1) lookup, using a hash map where possible:

$filter_links = array_unique($filter_links); 

So you would keep a map like $urlMap[$urlKey] = $url; and only insert a URL when its key is not already present. A quick way to compute the key could be md5, but there are faster ways.
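Applied to the code in the question, that replaces the repeated array_unique() calls, each of which rescans the whole array, with one hash lookup per link:

$urlMap = array();

// inside the anchor loop, instead of $filter_links[] = $href:
$urlKey = md5($href); // crc32($href) would be faster, at some collision risk
if (!isset($urlMap[$urlKey])) {
    $urlMap[$urlKey] = $href;
}

// array_values($urlMap) then holds the deduplicated links
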

The other big I/O problem I can see is that you hit the database with an INSERT for every scraped site. Instead of doing that, you could collect the data into another array and insert all of the site data into your SQL server in one batch at the end.
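A rough sketch of that batching, reduced to two columns for illustration (a real version would carry all the columns from domain_insert_check):

$pending = array();

// while scraping, queue the row instead of INSERTing immediately:
$pending[] = "('" . mysqli_real_escape_string($link, $href_ex_https) . "', " . (int) $is_external . ")";

// when the batch is done, one round-trip to MySQL:
if (count($pending) > 0) {
    mysqli_query($link, "INSERT INTO domains (domain_name, is_external) VALUES " . implode(',', $pending))
        or die(mysqli_error($link));
    $pending = array();
}
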

That will buy you some speedup, but in order to scale you will have to think about splitting the process across multiple servers. For that you need a queue system; you could use RabbitMQ: https://www.rabbitmq.com/
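With a queue, each worker publishes the links it discovers instead of recursing into them, and any number of workers on any number of machines can consume the queue. A minimal producer sketch using the php-amqplib client (queue name and connection details are placeholders):

use PhpAmqpLib\Connection\AMQPStreamConnection;
use PhpAmqpLib\Message\AMQPMessage;

$connection = new AMQPStreamConnection('localhost', 5672, 'guest', 'guest');
$channel = $connection->channel();
$channel->queue_declare('urls_to_scrape', false, true, false, false); // durable queue

foreach ($namecheap_filter_internal_array as $url) {
    $msg = new AMQPMessage($url, array('delivery_mode' => AMQPMessage::DELIVERY_MODE_PERSISTENT));
    $channel->basic_publish($msg, '', 'urls_to_scrape');
}

$channel->close();
$connection->close();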