-2
我一直在使用以下脚本为我的客户网站创建站点地图。问题是它不适用于每个站点。我发现,如果不是所有托管在godaddy上的网站都不是蜘蛛网。如果任何人都可以在我的脚本中看到错误,或知道是什么导致错误,我将非常感谢帮助。php蜘蛛脚本不工作
在此先感谢
set_time_limit(0);
class spider_man
{
var $url;
var $limit;
var $cache;
var $crawled;
var $banned_ext;
var $domain;
function spider_man($url, $banned_ext, $limit){
$this->domain = $url;
$this->url = 'http://'.$url ;
$this->banned_ext = $banned_ext ;
$this->limit = $limit ;
if(!fopen($this->url, "r")) return false;
else $this->_spider($this->url);
}
function _spider($url){
$this->cache = @file_get_contents(urldecode($url));
if(!$this->cache) return false;
$this->crawled[] = urldecode($url) ;
preg_match_all("#href=\"(https?://[&=a-zA-Z0-9-_./]+)\"#si", $this->cache, $links);
if ($links) :
foreach ($links[1] as $hyperlink){
if(strpos($hyperlink,$this->domain)===false){ break; }
else{
$this->limit--;
if(! $this->limit) return;
if($this->is_valid_ext(trim($hyperlink)) and !$this->is_crawled($hyperlink)) :
$this->crawled[] = $hyperlink;
echo "Crawling $hyperlink<br />\n";
unset($this->cache);
$this->_spider($hyperlink);
endif;
}
}
endif;
}
function is_valid_ext($url){
foreach($this->banned_ext as $ext){
if($ext == substr($url, strlen($url) - strlen($ext))) return false;
}
return true;
}
function is_crawled($url){
return in_array($url, $this->crawled);
}
}
$banned_ext = array(".dtd",".css",".xml",".js",".gif",".jpg",".jpeg",".bmp",".ico",".rss",".pdf",".png",".psd",".aspx",".jsp",".srf",".cgi",".exe",".cfm");
$spider = new spider_man('domain.com', $banned_ext, 100);
print_r($spider->crawled);
什么似乎是问题? –
当您尝试在GoDaddy网站上运行它时会发生什么? – andrewsi
这些网站不返回任何网址。它只是返回一个空数组 – James