2014-11-24 22 views
0

我试图通过使用cURL来抓取网站。到目前为止,我已经写了下面的代码。问题:由于特殊字符而无法使用PHP解析HTML内容。

Curl 类:

<?php 

/**
 * Small cURL wrapper that sends browser-like headers and keeps cookies in a
 * file so they are shared between requests.
 *
 * Every get()/postForm() call creates a fresh handle; the most recent handle
 * stays in $curl so getInfo() can be queried after the request.
 */
class Curl
{
    /** @var string Path of the file cookies are read from and written to. */
    public $cookieJar = "";

    /** @var mixed cURL handle of the most recent request, or null before the first one. */
    public $curl = null;

    /**
     * @param string $cookieJarFile Path of the cookie file to use.
     */
    public function __construct($cookieJarFile = 'cookies.txt')
    {
        $this->cookieJar = $cookieJarFile;
    }

    /**
     * Applies the shared options (headers, user agent, cookies, redirects)
     * to the handle currently stored in $curl.
     *
     * Must be called after $curl has been initialised with curl_init().
     */
    public function setup()
    {
        $header = array(
            "Accept: text/xml,application/xml,application/xhtml+xml,"
                . "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
            "Cache-Control: max-age=0",
            "Connection: keep-alive",
            "Keep-Alive: 300",
            "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7",
            "Accept-Language: en-us,en;q=0.5",
            "Pragma: ", // browsers keep this blank.
        );

        curl_setopt($this->curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.8.1.7) Gecko/20070914 Firefox/2.0.0.7');
        curl_setopt($this->curl, CURLOPT_HTTPHEADER, $header);
        // Use the configured property: a bare local $cookieJar is undefined here,
        // which would leave cookie persistence switched off.
        curl_setopt($this->curl, CURLOPT_COOKIEJAR, $this->cookieJar);
        curl_setopt($this->curl, CURLOPT_COOKIEFILE, $this->cookieJar);
        curl_setopt($this->curl, CURLOPT_AUTOREFERER, true);
        curl_setopt($this->curl, CURLOPT_FOLLOWLOCATION, true);
        // Make curl_exec() return the body instead of printing it.
        curl_setopt($this->curl, CURLOPT_RETURNTRANSFER, true);
    }

    /**
     * Performs a GET request.
     *
     * @param string $url Address to fetch.
     * @return string|false Response body, or false when the transfer fails.
     */
    public function get($url)
    {
        $this->curl = curl_init($url);
        $this->setup();

        return $this->request();
    }

    /**
     * Returns everything captured by the first group of $reg in $str.
     *
     * @param string $reg PCRE pattern, expected to contain a capture group.
     * @param string $str Text to search.
     * @return array Captured strings; empty when the pattern has no first group.
     */
    public function getAll($reg, $str)
    {
        preg_match_all($reg, $str, $matches);

        return isset($matches[1]) ? $matches[1] : array();
    }

    /**
     * Performs a POST request.
     *
     * @param string       $url     Address to post to.
     * @param array|string $fields  Payload handed to CURLOPT_POSTFIELDS.
     * @param string       $referer Value for the Referer header.
     * @return string|false Response body, or false when the transfer fails.
     */
    public function postForm($url, $fields, $referer = '')
    {
        $this->curl = curl_init($url);
        $this->setup();
        curl_setopt($this->curl, CURLOPT_POST, true);
        curl_setopt($this->curl, CURLOPT_REFERER, $referer);
        curl_setopt($this->curl, CURLOPT_POSTFIELDS, $fields);

        return $this->request();
    }

    /**
     * Reads transfer information from the most recent handle.
     *
     * @param int|string $info A CURLINFO_* constant, or 'lasturl' as a
     *                         shortcut for CURLINFO_EFFECTIVE_URL.
     * @return mixed Whatever curl_getinfo() reports for that option.
     */
    public function getInfo($info)
    {
        if ($info === 'lasturl') {
            return curl_getinfo($this->curl, CURLINFO_EFFECTIVE_URL);
        }

        return curl_getinfo($this->curl, $info);
    }

    /**
     * Executes the prepared handle.
     *
     * @return string|false Result of curl_exec().
     */
    public function request()
    {
        return curl_exec($this->curl);
    }
}

?> 

然后我在我的PHP文件中调用这个 Curl 类:

include_once("curl.php");
$curl = new Curl();
// Keep the response exactly as received. Running it through htmlentities()
// would rewrite every '<' as '&lt;', so a later search for raw tags such as
// '<span ...>' could never match.
$html = $curl->get("www.somewebsite.com");
if ($html === false) {
    // get() returns the result of curl_exec(), which is false when the transfer fails.
    exit("Request failed");
}
/**
 * Returns the text found between the first occurrence of $start and the
 * next occurrence of $end after it.
 *
 * @param string $string Text to search.
 * @param string $start  Opening delimiter.
 * @param string $end    Closing delimiter.
 * @return string The enclosed text, or "" when either delimiter is empty or
 *                cannot be found.
 */
function get_string_between($string, $start, $end)
{
    $string = (string) $string;
    if ($start === '' || $end === '') {
        return "";
    }

    // Compare against false explicitly: position 0 is a valid match.
    $from = strpos($string, $start);
    if ($from === false) {
        return "";
    }
    $from += strlen($start);

    // Without this check a missing closing delimiter yields a negative
    // length and substr() returns an unrelated slice of the text.
    $to = strpos($string, $end, $from);
    if ($to === false) {
        return "";
    }

    return substr($string, $from, $to - $from);
}
// Inside single quotes a backslash before '"' or '/' is kept literally, so the
// delimiters are written without escapes to match the markup as it appears.
echo get_string_between($html, '<span class="review-text">', '</span>');

现在,我想得到两个字符串之间的字符串,我得到一个空白页。但是,当我看到html内容时,我显然能够发现该字符串。

HTML内容非常大,我试图搜索并获取巨大文件之间的内容。

我甚至试图用“&lt;”符号替换“<”符号,但它似乎没有找到字符串。

+0

使用htmlspecialchars http://php.net/manual/en/function.htmlspecialchars.php – 2014-11-24 12:53:28

+0

@MohamadAttat:它似乎并没有这样工作。 – 2014-11-24 12:55:13

回答

1

有一个更好的方法来获取html标记的值,那就是使用DOM。

// Parse the fetched markup and print the text of every
// <span class="review-text"> element.
$dom = new DOMDocument();
// Has to be set before loading to have any effect on parsing.
$dom->preserveWhiteSpace = false;
// Collect parser warnings about malformed real-world HTML internally
// instead of emitting them, then restore the previous setting.
$previousErrorMode = libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previousErrorMode);

$spans = $dom->getElementsByTagName('span');
foreach ($spans as $span) {
    if ($span->getAttribute('class') === 'review-text') {
        print $span->nodeValue;
    }
}

或者有另一种方式:

// Select the first element whose class attribute is exactly "review-text".
$dompath = new DOMXPath($dom);
$review_div = $dompath->query('//*[@class="review-text"]')->item(0);
// item(0) is null when nothing matched; fall back to an empty string.
$string = ($review_div !== null) ? $review_div->nodeValue : '';

希望这有助于你。

+0

此方法给我一个500内部服务器错误。任何想法为什么? – 2014-11-25 04:52:42

+1

500错误是服务器端错误,这意味着服务器有问题。我不认为它与php脚本有任何关系。或者,如果您用 echo 输出curl的响应,则可能是您尝试抓取的页面有错误。 – spreadzz 2014-11-25 12:31:07