1
我一直在尝试对这个网站进行屏幕抓取(screen scraping)。同样的脚本在许多其他网站上都能正常工作,但出于某种原因,对这个页面(http://www.coast-stores.com/SOPHIE-DRESS/Dresses/coast/fcp-product/2224724715)只能获取到页眉和页脚。(标题:使用 cURL 进行屏幕抓取时只获取到页眉和页脚)
/**
 * Fetch a URL with cURL, following HTTP redirects and simple JavaScript
 * redirects by hand, and return the final response body.
 *
 * HTTP redirects are followed manually (CURLOPT_FOLLOWLOCATION is off), and
 * JavaScript redirects of the form window.location.replace('...') or
 * window.location="..." are detected in the returned markup.
 *
 * @param string $url             URL to fetch.
 * @param int    $javascript_loop Number of JavaScript redirects already
 *                                followed; callers normally omit it.
 * @param int    $redirect_loop   Number of HTTP redirects already followed;
 *                                callers normally omit it.
 * @return string|false Response body, or false when curl_exec() fails.
 */
function get_url($url, $javascript_loop = 0, $redirect_loop = 0) {
    // Upper bound for each kind of redirect so a redirect cycle cannot
    // recurse forever.
    $max_hops = 5;

    $header = array(
        "Accept: text/xml,application/xml,application/xhtml+xml,"
            . "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Cache-Control: max-age=0",
        "Connection: keep-alive",
        "Keep-Alive: 300",
        "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7",
        "Accept-Language: en-us,en;q=0.5",
        "Pragma: ",
    );

    // NOTE(review): this path is at the filesystem root; confirm the web
    // server user can write there, otherwise cookies are silently not saved.
    $cookie = '/cookies.txt';

    // The connect timeout must be shorter than the overall timeout; the
    // original had them the other way round (30s connect vs 20s total), which
    // made the connect timeout unreachable.
    $connect_timeout = 20;
    $timeout = 30;

    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7 (.NET CLR 3.5.30729)');
    curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
    curl_setopt($curl, CURLOPT_ENCODING, 'gzip,deflate');
    curl_setopt($curl, CURLOPT_AUTOREFERER, true);
    curl_setopt($curl, CURLOPT_REFERER, 'http://google.co.uk/');
    curl_setopt($curl, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, $connect_timeout);
    curl_setopt($curl, CURLOPT_COOKIEJAR, $cookie);
    curl_setopt($curl, CURLOPT_COOKIEFILE, $cookie);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    // Redirects are handled below so that each hop goes through this function.
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, false);
    // SECURITY: disables TLS certificate verification. Kept from the original
    // ("required for https urls"); prefer configuring a CA bundle instead.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 30);
    // CURLOPT_BINARYTRANSFER was dropped: it is a deprecated no-op when
    // CURLOPT_RETURNTRANSFER is set.

    $responseHTML = curl_exec($curl);
    $response = curl_getinfo($curl);
    curl_close($curl); // close the connection

    // --- HTTP redirects -----------------------------------------------------
    $redirect_codes = array(301, 302, 303, 307, 308);
    if (in_array($response['http_code'], $redirect_codes) && $redirect_loop < $max_hops) {
        $location = '';
        if (!empty($response['redirect_url'])) {
            // cURL already parsed the Location header of this response (and
            // resolved it against the request URL), so no extra request is
            // needed.
            $location = $response['redirect_url'];
        } else {
            // Fallback for builds whose curl_getinfo() lacks redirect_url:
            // re-request the headers through the stream wrapper.
            ini_set("user_agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1");
            $headers = get_headers($response['url']);
            if ($headers) {
                foreach ($headers as $value) {
                    if (strncasecmp($value, "location:", 9) === 0) {
                        $location = trim(substr($value, 9));
                        break;
                    }
                }
            }
        }
        if ($location !== '') {
            return get_url($location, $javascript_loop, $redirect_loop + 1);
        }
    }

    // --- JavaScript redirects -----------------------------------------------
    // The original matched against an undefined $content, so this branch could
    // never fire; it must inspect the body that was just downloaded. The
    // captures stop at the closing quote instead of running greedily to the
    // last quote on the line.
    // NOTE: a relative JavaScript target is passed on as-is, not resolved.
    if (is_string($responseHTML)
        && $javascript_loop < $max_hops
        && (preg_match("/>[[:space:]]+window\.location\.replace\('([^']*)'\)/i", $responseHTML, $value)
            || preg_match("/>[[:space:]]+window\.location\=\"([^\"]*)\"/i", $responseHTML, $value))
        && $value[1] !== ''
    ) {
        return get_url($value[1], $javascript_loop + 1, $redirect_loop);
    }

    return $responseHTML;
}
// Entry point: fetch the page named by the "url" query parameter and echo its
// markup back to the client.
// SECURITY: this proxies arbitrary user-supplied URLs. Only the scheme is
// checked here; restrict the allowed hosts before exposing this publicly.
$url = (isset($_GET['url']) && is_string($_GET['url'])) ? trim($_GET['url']) : '';
if (!preg_match('#^https?://#i', $url)) {
    header('HTTP/1.1 400 Bad Request');
    exit('Missing or invalid "url" parameter (must start with http:// or https://).');
}

$text = get_url($url);
if ($text === false) {
    // get_url() returns false when the transfer itself failed; say so instead
    // of emitting an empty page.
    header('HTTP/1.1 502 Bad Gateway');
    exit('Failed to fetch the requested URL.');
}
echo $text;
有人知道为什么它没有获取到主要内容吗?内容是否是在 HTML 显示之后才传送的?
用这个脚本尝试任何其他网站,似乎都能正常工作!
感谢您的帮助!
该URL甚至在我的浏览器中也不起作用。我每次都会收到一个指向主页的301重定向。我甚至无法手动导航到该页面。我认为它已经不存在了。很难抓取一个已被删除的网址。 – 2011-05-11 22:31:17
如果您尝试使用实际存在的网址,则效果不错:http://www.mattfacer.com/scraping/scraping2.php?url=http://us.coast-stores.com/Coast-Allure-Maxi/dp/B004SFF0V2 – 2011-05-11 22:34:45
这很奇怪 - 我想知道他们的网站是否根据IP地址做了某种国家/地区检查?服务器位于美国。我认为你也在美国 - 因为我也可以看到整个页面显示的是美元价格。然而,在我的例子中,我可以看到网址,但看不到内容。一切都很奇怪。 – 2011-05-12 09:08:39