2011-08-14 49 views
1

在下面的函数中,我想指定一个需要从结果中排除的域名列表。有哪些可行的做法?用一个数组保存要排除的域名?还是直接在 XPath 查询中排除指向这些网址的链接?

class KeywordSearch 
{  
    // XPath expression used by getResults() to select result anchors: <a> elements whose class is "l".
    const GOOGLE_SEARCH_XPATH = "//a[@class='l']"; 
    // Search query; assigned by the constructor from its $query argument.
    public $searchQuery; 
    // Result count; assigned by the constructor from $numres (default 7).
    public $numResults ; 
    // Reset to an empty array by the constructor; not otherwise touched in the visible code.
    public $sites; 
    // NOTE(review): the three properties below are not read or written in the visible code;
    // presumably they hold the aggregated text, word list and keyword list — confirm against the rest of the class.
    public $finalPlainText = ''; 
    public $finalWordList = array(); 
    public $finalKeywordList = array(); 

    /**
     * Stores the search parameters and starts with an empty site list.
     *
     * @param mixed $query  Value stored as-is in $searchQuery.
     * @param mixed $numres Value stored as-is in $numResults; defaults to 7.
     */
    public function __construct($query, $numres = 7)
    {
        // The three assignments are independent of each other.
        $this->sites       = array();
        $this->numResults  = $numres;
        $this->searchQuery = $query;
    }

    // List consulted by kwFilter() to decide which result URLs are dropped from getResults().
    // NOTE(review): Wikipedia is normally served from wikipedia.org — confirm 'wikipedia.com' is the intended entry.
    protected static $_excludeUrls = array('wikipedia.com','amazon.com','youtube.com','zappos.com');//JSB NEW 

    /**
     * Extracts the result link URLs from a search results page and drops
     * every URL rejected by kwFilter().
     *
     * @param string $searchHtml Raw HTML of the results page.
     *
     * @return array Zero-indexed list of href values that passed the filter;
     *               empty when the input is empty or cannot be parsed.
     */
    private function getResults($searchHtml){
        // DOMDocument::loadHTML() rejects an empty string (ValueError on PHP 8,
        // which the @ operator cannot suppress), so bail out early.
        if (!is_string($searchHtml) || trim($searchHtml) === '') {
            return array();
        }

        $dom = new DOMDocument();
        $dom->preserveWhiteSpace = false;
        $dom->formatOutput = false;

        // Real-world HTML is rarely valid; collect libxml's parse warnings
        // internally instead of silencing everything with @, then restore
        // the caller's libxml error mode.
        $previousMode = libxml_use_internal_errors(true);
        $loaded = $dom->loadHTML($searchHtml);
        libxml_clear_errors();
        libxml_use_internal_errors($previousMode);

        if (!$loaded) {
            return array();
        }

        $xpath = new DOMXPath($dom);
        $links = $xpath->query(self::GOOGLE_SEARCH_XPATH);

        $results = array();
        if ($links) {
            foreach ($links as $link) {
                $results[] = $link->getAttribute('href');
            }
        }

        // An array callable avoids the 'self::method' string form (deprecated
        // in PHP 8.2). array_filter() preserves keys, so re-index to hand back
        // a gap-free list.
        return array_values(array_filter($results, array(__CLASS__, 'kwFilter')));
    }

    /**
     * array_filter() callback: decides whether a result URL is kept.
     *
     * A URL is rejected when it equals an entry of self::$_excludeUrls, or
     * when its host is one of those entries or a subdomain of one
     * (e.g. "http://www.amazon.com/x" is rejected by "amazon.com").
     *
     * @param string $value The href of a search result.
     *
     * @return bool True to keep the URL, false to drop it.
     */
    protected static function kwFilter($value)
    {
        // Keep the original behaviour: an exact match is always excluded.
        if (in_array($value, self::$_excludeUrls, true)) {
            return false;
        }

        // Result hrefs are full URLs, so compare on the host part; comparing
        // the whole URL against bare domain names would never match.
        $host = parse_url($value, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            // Relative or unparsable URL: no host to test, keep it.
            return true;
        }
        $host = strtolower($host);

        foreach (self::$_excludeUrls as $domain) {
            $domain = strtolower($domain);
            if ($host === $domain) {
                return false;
            }
            // The leading dot keeps "notamazon.com" from matching "amazon.com".
            $suffix = '.' . $domain;
            if (substr($host, -strlen($suffix)) === $suffix) {
                return false;
            }
        }

        return true;
    }

回答

1
// List consulted by myFilter() to decide which result URLs are dropped; 'foo.com' and 'bar.com' are placeholder entries.
protected static $_banUrls = array('foo.com','bar.com'); 

/**
 * Extracts the result link URLs from a search results page, skipping every
 * URL rejected by myFilter().
 *
 * @param string $searchHtml Raw HTML of the results page.
 *
 * @return array Zero-indexed list of href values that passed the filter;
 *               empty when the input is empty or cannot be parsed.
 */
private function getResults($searchHtml){
    // DOMDocument::loadHTML() rejects an empty string (ValueError on PHP 8,
    // which the @ operator cannot suppress), so bail out early.
    if (!is_string($searchHtml) || trim($searchHtml) === '') {
        return array();
    }

    $dom = new DOMDocument();
    $dom->preserveWhiteSpace = false;
    $dom->formatOutput = false;

    // Collect libxml's parse warnings internally instead of silencing
    // everything with @, then restore the caller's libxml error mode.
    $previousMode = libxml_use_internal_errors(true);
    $loaded = $dom->loadHTML($searchHtml);
    libxml_clear_errors();
    libxml_use_internal_errors($previousMode);

    if (!$loaded) {
        return array();
    }

    $xpath = new DOMXPath($dom);
    $links = $xpath->query(self::GOOGLE_SEARCH_XPATH);

    $results = array();
    if ($links) {
        foreach ($links as $link) {
            $href = $link->getAttribute('href');
            // Filter while collecting: no callback string is needed and the
            // resulting list has no key gaps.
            if (self::myFilter($href)) {
                $results[] = $href;
            }
        }
    }

    return $results;
}

    /**
     * Decides whether a result URL is kept.
     *
     * A URL is rejected when it equals an entry of self::$_banUrls, or when
     * its host is one of those entries or a subdomain of one
     * (e.g. "http://www.foo.com/x" is rejected by "foo.com").
     *
     * @param string $value The href of a search result.
     *
     * @return bool True to keep the URL, false to drop it.
     */
    protected static function myFilter($value)
    {
        // Keep the original behaviour: an exact match is always banned.
        if (in_array($value, self::$_banUrls, true)) {
            return false;
        }

        // Result hrefs are full URLs, so compare on the host part; comparing
        // the whole URL against bare domain names would never match.
        $host = parse_url($value, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            // Relative or unparsable URL: no host to test, keep it.
            return true;
        }
        $host = strtolower($host);

        foreach (self::$_banUrls as $domain) {
            $domain = strtolower($domain);
            if ($host === $domain) {
                return false;
            }
            // The leading dot keeps "notfoo.com" from matching "foo.com".
            $suffix = '.' . $domain;
            if (substr($host, -strlen($suffix)) === $suffix) {
                return false;
            }
        }

        return true;
    }
+0

+1看起来不错Jason。谢谢! –

+0

@Scott B 很高兴能帮到你。如果你觉得这个回答有用,请采纳它。 –

+0

我收到一个错误:“第二个参数 'myFilter' 应该是一个有效的回调” –

1

既然你给这个问题加了 XPath 标签,下面演示如何用 XPath 的 contains 函数来实现:

// Demonstrates excluding links by host inside the XPath query itself,
// using one not(contains(...)) test per excluded host.

// Sample markup; the anchors are deliberately left unclosed, as scraped HTML often is.
$html = <<<HTML
<ul>
    <li><a href="http://foo.example.com">
    <li><a href="http://bar.example.com">
    <li><a href="http://baz.example.com">
</ul>
HTML;

// Hosts whose links must not appear in the output.
$excludedHosts = array('foo.example.com', 'bar.example.com');

$dom = new DOMDocument;

// Collect libxml's parse warnings internally rather than printing them,
// then restore the previous libxml error mode.
$previousMode = libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previousMode);

$xp = new DOMXPath($dom);

// Build one condition per host. XPath 1.0 string literals have no escape
// sequence, so the hosts must not contain a double quote (host names never do).
// Note: contains() is a substring test, so it also matches the host text
// appearing elsewhere in the URL.
$conditions = array();
foreach ($excludedHosts as $host) {
    $conditions[] = sprintf('not(contains(., "%s"))', $host);
}

// An empty predicate "[]" is invalid XPath, so only append it when needed.
$query = '//a/@href';
if (count($conditions) > 0) {
    $query .= '[' . implode(' and ', $conditions) . ']';
}

foreach ($xp->query($query) as $hrefAttr) {
    // One URL per line, so several matches do not run together.
    echo $hrefAttr->nodeValue, PHP_EOL;
}

运行后将输出:

http://baz.example.com 

其他可用于测试节点集的字符串函数,请参阅 XPath 1.0 规范。