2015-02-08 85 views
1

我想从一个页面中找出所有未嵌套在斜体标签(<i>)中的锚点。下面是我目前的代码,它可以运行,但链接没有按照正确的顺序(即页面源代码中的顺序)被处理。(PHP XPath 查询)

// Parse the downloaded page; the @ operator hides the warnings raised while parsing.
@$dom->loadHTML($this->html); 
$xpath = new DOMXpath($dom); 
// Select every href attribute of an <a> whose value differs from the href of
// every <a> nested inside an <i>. The comparison is made on attribute values,
// so a link that occurs both inside and outside italics is dropped everywhere.
$anchorlinks = $xpath->query('//a/@href[not(. = //i//a/@href)]'); 

关于我应该如何对这两组 XPath 查询的结果求差集(diff),如有任何建议,我将不胜感激。

谢谢。

// Drive the crawler: start from the Yarn article and run 30 hop/process rounds.
// gettingToPhilosophy is a class, so it must be instantiated with `new`;
// calling it like a function is a fatal "undefined function" error.
$phil = new gettingToPhilosophy("http://en.wikipedia.org/Yarn");
for ($i = 0; $i < 30; $i++)
{
    $phil->hop();         // download the page the crawler currently points at
    $phil->processHTML(); // pick the next target from the downloaded HTML
}

<?php 
/**
 * Follows links from one Wikipedia article to the next (the "Getting to
 * Philosophy" game).
 *
 * Usage: construct with a start URL, then alternate hop() (download the
 * current target) and processHTML() (choose the next target from the
 * downloaded markup). Fatal conditions terminate the script via die().
 */
class gettingToPhilosophy
{
    public $base_url; //validated start URL (false when validation failed)
    public $target_url; //path to hop to; hop() turns it into an absolute URL
    public $previous_link; //path of the page visited before the current target
    public $lookup; //visited targets, keyed by path
    public $curl; //curl handle used for every request
    public $html; //html retrieved from the last curl request
    public $conn; //database connection resource (never opened by this class)
    public $hoplimit; //maximum number of hops (23 was the median as per the wikipedia article); not enforced by any method here
    public $hop_num; //number of successful downloads so far
    public $id; //id of current link (Primary Key)
    public $child_id; //id of next link

    /**
     * Validates the start URL and resets all crawl state.
     *
     * @param string $base_url absolute URL of the article to start from;
     *                         the script dies when it is not a valid URL
     */
    public function __construct($base_url)
    {
        $this->base_url = filter_var($base_url, FILTER_VALIDATE_URL);

        //filter_var() returns false for anything that is not a well-formed URL
        if (!($this->base_url))
        {
            die("<font color='red'>Invalid URL</font>");
        }

        //only the path is kept; hop() prepends the wikipedia host again
        $this->target_url = parse_url($base_url, PHP_URL_PATH);
        $this->previous_link = '';
        $this->lookup = array();
        $this->curl = curl_init();

        $this->hoplimit = 30;
        $this->hop_num = 0;
        $this->id = 1;
        $this->child_id = 0;
    }

    /**
     * Drops the references held by the object.
     */
    public function __destruct()
    {
        $this->base_url = null;
        $this->target_url = null;
        $this->previous_link = null;
        $this->curl = null;
        $this->lookup = null;
        $this->conn = null;
        $this->id = null;
        $this->child_id = null;
    }

    /**
     * Downloads the current target page into $this->html.
     *
     * Dies when the target was already visited (loop) or when the download
     * fails. On success $this->target_url holds the absolute URL that was
     * fetched and $this->hop_num is incremented.
     */
    public function hop()
    {
        //a target seen before means the crawl is going in circles
        if (isset($this->lookup[$this->target_url]))
        {
            die("<font color='red'>Never ending loop: $this->target_url has already been seen</font>");
        }

        $this->lookup[$this->target_url] = 1; //remember the target

        $this->child_id++;
        //NOTE(review): this statement is only echoed, never executed. It is
        //built by interpolation, so switch to a parameterised query before
        //ever sending it to a database.
        $sql = "insert into Philosophy (base_url, childid, link) values('$this->base_url', $this->child_id, '$this->target_url')";
        echo "$sql <br/>";

        //turn the stored path into an absolute wikipedia URL
        $this->target_url = "http://en.wikipedia.org".$this->target_url;

        curl_setopt_array($this->curl, array(
            CURLOPT_USERAGENT      => 'Googlebot/2.1 (http://www.google.bot.com/bot.html)',
            CURLOPT_FAILONERROR    => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_AUTOREFERER    => true,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT        => 10,
            CURLOPT_URL            => $this->target_url,
        ));

        $this->html = curl_exec($this->curl);

        //curl_exec() returns false on failure; an empty body is treated the same way
        if (!$this->html)
        {
            die("<font color='red'>$this->target_url is invalid or unreachable - Hopped $this->hop_num times</font>");
        }
        $this->hop_num++;
    }

    /**
     * Chooses the next target from the downloaded page.
     *
     * Walks the href attributes of all anchors that are not inside an <i>
     * element and takes the first one accepted by isValid(). That href
     * becomes $this->target_url, and the path of the page just processed is
     * stored in $this->previous_link. Nothing changes when no link qualifies.
     */
    public function processHTML()
    {
        //nothing to parse before a successful hop()
        if (!$this->html)
        {
            return;
        }

        $dom = new DOMDocument();

        //collect libxml's complaints about real-world markup instead of
        //silencing every error with @, then restore the caller's setting
        $previousSetting = libxml_use_internal_errors(true);
        $dom->loadHTML($this->html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        $xpath = new DOMXpath($dom);
        $anchorlinks = $xpath->query('//a[not(ancestor::i)]/@href');

        foreach ($anchorlinks as $anchorlink)
        {
            if (!$this->isValid($anchorlink->nodeValue)) { continue; }

            //hrefs accepted by isValid() are relative paths, so the previous
            //page has to be remembered as a path too; an absolute URL could
            //never match in isValid()
            $currentPath = parse_url($this->target_url, PHP_URL_PATH);
            $this->previous_link = $currentPath ? $currentPath : $this->target_url;
            $this->target_url = "$anchorlink->nodeValue";
            break;
        }
    }

    /**
     * Tells whether a link may be followed.
     *
     * @param string $link href value taken from the page
     * @return bool false for the previous page, fragments, queries, absolute
     *              or protocol-relative URLs and non-article namespaces
     */
    public function isValid($link)
    {
        if ($link === $this->previous_link) { return false; }

        //fragments matched exactly as written
        $caseSensitive = array('#', '(', '.jpg', '?', '//');
        foreach ($caseSensitive as $fragment)
        {
            if (strpos($link, $fragment) !== false) { return false; }
        }

        //fragments matched regardless of letter case
        $caseInsensitive = array(
            'Help:', 'navigation', '[note', 'File:', 'http', 'Portal:',
            'Special:', 'Wikipedia:', 'Talk:', 'Category:', 'Main_Page',
        );
        foreach ($caseInsensitive as $fragment)
        {
            if (stripos($link, $fragment) !== false) { return false; }
        }

        return true;
    }

    /**
     * Prints the links recorded in the database for this start URL.
     *
     * Does nothing while $this->conn holds no connection.
     */
    public function printLinks()
    {
        if (!$this->conn)
        {
            return;
        }

        //the value is bound as a parameter instead of being pasted into the SQL
        $sql = 'select childid, link from philosophy where base_url = $1';
        if ($result = pg_query_params($this->conn, $sql, array($this->base_url)))
        {
            while ($row = pg_fetch_assoc($result))
            {
                echo "{$row['childid']}) {$row['link']} <br/>";
            }
        }
    }
}
?> 

回答

2

> 我想从一个页面中找出所有未嵌套在斜体标签中的锚点

那么,你应该改用

//a[not(ancestor::i)]/@href 

它做的正是这件事:选出所有不是 i 元素后代的 a 元素的 href 属性。


在不同的 XPath 1.0 实现之间,结果集中节点的顺序可能会有所不同。使用一个符合标准的 XPath 处理器,将上述 XPath 表达式应用到 http://en.wikipedia.org/wiki/Wikipedia:Getting_to_Philosophy ,得到的结果如下(各条结果之间以 ----- 分隔):

href="#mw-head" 
----------------------- 
href="#p-search" 
----------------------- 
href="/wiki/File:Essay.svg" 
----------------------- 
href="/wiki/Wikipedia:Wikipedia_essays" 
----------------------- 
href="/wiki/Wikipedia:Policies_and_guidelines" 
----------------------- 
href="/wiki/Hyperlink" 
----------------------- 
href="/wiki/Wikipedia" 
----------------------- 
href="/wiki/Philosophy" 
----------------------- 
href="/wiki/Philosophy" 
----------------------- 
href="#cite_note-1" 
----------------------- 
href="/wiki/File:Crawl_on_Wikipedia_from_random_article_to_Philosophy..gif" 
----------------------- 
href="/wiki/File:Crawl_on_Wikipedia_from_random_article_to_Philosophy..gif" 
----------------------- 
href="/wiki/Document_classification" 
----------------------- 
href="/wiki/Wikipedia:MOSBEGIN" 
----------------------- 
href="/wiki/Mathematics" 
----------------------- 
href="/wiki/Science" 
----------------------- 
href="/wiki/Language" 
----------------------- 
href="/wiki/Philosophy" 
----------------------- 
href="#Method_summarized" 
----------------------- 
href="#Origins" 
----------------------- 
href="#Examples_of_exceptions_to_the_Getting_to_Philosophy_rule" 
----------------------- 
href="#See_also" 
----------------------- 
href="#References" 
----------------------- 
href="#External_links" 
----------------------- 
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;action=edit&amp;section=1" 
----------------------- 
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;action=edit&amp;section=2" 
----------------------- 
href="/wiki/Phenomenon" 
----------------------- 
href="/wiki/User:Mark_J" 
----------------------- 
href="#cite_note-2" 
----------------------- 
href="/wiki/Wikipedia:WikipediaWeekly/Episode50" 
----------------------- 
href="/wiki/Podcast" 
----------------------- 
href="#cite_note-3" 
----------------------- 
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;action=edit&amp;section=3" 
----------------------- 
href="/wiki/Yarn" 
----------------------- 
href="/wiki/Fibres" 
----------------------- 
href="/wiki/Rope" 
----------------------- 
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;action=edit&amp;section=4" 
----------------------- 
href="/wiki/Small-world_network" 
----------------------- 
href="/wiki/Attractor" 
----------------------- 
href="/wiki/Wikipedia:Wiki_Game" 
----------------------- 
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;action=edit&amp;section=5" 
----------------------- 
href="#cite_ref-1" 
----------------------- 
href="/wiki/User:Ilmari_Karonen/First_link" 
----------------------- 
href="/wiki/Help:CS1_errors#cite_web_url" 
----------------------- 
href="#cite_ref-2" 
----------------------- 
href="http://en.wikipedia.org/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;oldid=215744293" 
----------------------- 
href="#cite_ref-3" 
----------------------- 
href="http://huffduffer.com/psd/42471" 
----------------------- 
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;action=edit&amp;section=6" 
----------------------- 
href="http://www.xefer.com/wikipedia" 
----------------------- 
href="http://www.youtube.com/watch?v=vehDe2lSptU" 
----------------------- 
href="/wiki/Philosophy" 
----------------------- 
href="http://matpalm.com/blog/2011/08/13/wikipedia-philosophy/" 
----------------------- 
href="http://xkcd.com/903/" 
----------------------- 
href="/wiki/Xkcd" 
----------------------- 
href="/wiki/Tooltip" 
----------------------- 
href="http://wikiloopr.com/" 
----------------------- 
href="http://www.guardian.co.uk/technology/2011/jul/10/only-way-essex-wikipedia-philosophy" 
----------------------- 
href="/wiki/The_Guardian" 
----------------------- 
href="http://www.huffingtonpost.com/2011/11/14/wikipedia-philosophy_n_1093460.html" 
----------------------- 
href="http://en.wikipedia.org/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;oldid=645649870" 
----------------------- 
href="/wiki/Help:Category" 
----------------------- 
href="/wiki/Category:Wikipedia_essays" 
----------------------- 
href="/wiki/Category:Pages_using_web_citations_with_no_URL" 
----------------------- 
href="/w/index.php?title=Special:UserLogin&amp;returnto=Wikipedia:Getting+to+Philosophy&amp;type=signup" 
----------------------- 
href="/w/index.php?title=Special:UserLogin&amp;returnto=Wikipedia:Getting+to+Philosophy" 
----------------------- 
href="/wiki/Wikipedia:Getting_to_Philosophy" 
----------------------- 
href="/wiki/Wikipedia_talk:Getting_to_Philosophy" 
----------------------- 
href="#" 
----------------------- 
href="/wiki/Wikipedia:Getting_to_Philosophy" 
----------------------- 
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;action=edit" 
----------------------- 
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;action=history" 
----------------------- 
href="#" 
----------------------- 
href="/wiki/Main_Page" 
----------------------- 
href="/wiki/Main_Page" 
----------------------- 
href="/wiki/Portal:Contents" 
----------------------- 
href="/wiki/Portal:Featured_content" 
----------------------- 
href="/wiki/Portal:Current_events" 
----------------------- 
href="/wiki/Special:Random" 
----------------------- 
href="https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&amp;utm_medium=sidebar&amp;utm_campaign=C13_en.wikipedia.org&amp;uselang=en" 
----------------------- 
href="//shop.wikimedia.org" 
----------------------- 
href="/wiki/Help:Contents" 
----------------------- 
href="/wiki/Wikipedia:About" 
----------------------- 
href="/wiki/Wikipedia:Community_portal" 
----------------------- 
href="/wiki/Special:RecentChanges" 
----------------------- 
href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" 
----------------------- 
href="/wiki/Special:WhatLinksHere/Wikipedia:Getting_to_Philosophy" 
----------------------- 
href="/wiki/Special:RecentChangesLinked/Wikipedia:Getting_to_Philosophy" 
----------------------- 
href="/wiki/Wikipedia:File_Upload_Wizard" 
----------------------- 
href="/wiki/Special:SpecialPages" 
----------------------- 
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;oldid=645649870" 
----------------------- 
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;action=info" 
----------------------- 
href="//www.wikidata.org/wiki/Q14605740" 
----------------------- 
href="/w/index.php?title=Special:Book&amp;bookcmd=book_creator&amp;referer=Wikipedia:Getting+to+Philosophy" 
----------------------- 
href="/w/index.php?title=Special:Book&amp;bookcmd=render_article&amp;arttitle=Wikipedia:Getting+to+Philosophy&amp;oldid=645649870&amp;writer=rdf2latex" 
----------------------- 
href="/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;printable=yes" 
----------------------- 
href="//fr.wikipedia.org/wiki/Wikip&#xE9;dia:Se_rendre_&#xE0;_l'article_philosophie" 
----------------------- 
href="//uk.wikipedia.org/wiki/&#x412;&#x456;&#x43A;&#x456;&#x43F;&#x435;&#x434;&#x456;&#x44F;:&#x412;&#x441;&#x456;_&#x43F;&#x43E;&#x441;&#x438;&#x43B;&#x430;&#x43D;&#x43D;&#x44F;_&#x432;&#x435;&#x434;&#x443;&#x442;&#x44C;_&#x434;&#x43E;_&#x444;&#x456;&#x43B;&#x43E;&#x441;&#x43E;&#x444;&#x456;&#x457;" 
----------------------- 
href="#" 
----------------------- 
href="//www.wikidata.org/wiki/Q14605740#sitelinks-wikipedia" 
----------------------- 
href="//en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License" 
----------------------- 
href="//creativecommons.org/licenses/by-sa/3.0/" 
----------------------- 
href="//wikimediafoundation.org/wiki/Terms_of_Use" 
----------------------- 
href="//wikimediafoundation.org/wiki/Privacy_policy" 
----------------------- 
href="//www.wikimediafoundation.org/" 
----------------------- 
href="//wikimediafoundation.org/wiki/Privacy_policy" 
----------------------- 
href="/wiki/Wikipedia:About" 
----------------------- 
href="/wiki/Wikipedia:General_disclaimer" 
----------------------- 
href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" 
----------------------- 
href="https://www.mediawiki.org/wiki/Special:MyLanguage/How_to_contribute" 
----------------------- 
href="//en.m.wikipedia.org/w/index.php?title=Wikipedia:Getting_to_Philosophy&amp;mobileaction=toggle_view_mobile" 
----------------------- 
href="//wikimediafoundation.org/" 
----------------------- 
href="//www.mediawiki.org/" 
+0

谢谢!!我正在尝试解决这个问题:http://en.wikipedia.org/wiki/Wikipedia:Getting_to_Philosophy 。但遗憾的是,你的 XPath 查询并没有按正确的顺序处理链接……还有其他建议吗? – Wes 2015-02-09 00:05:16

+0

xpath是否按不同顺序处理节点? – Wes 2015-02-09 00:22:48

+0

@Wes 不客气。很可能您的 XPath 引擎只支持 XPath 1.0。在 1.0 版本中,节点集被定义为_集合_(sets,可参考这个 [Wiki 页面](http://en.wikipedia.org/wiki/Set_%28mathematics%29)),这意味着其中的节点没有特定的顺序。不过,引擎通常会按文档顺序返回结果。您需要贴出全部 PHP 代码,并给出您目前得到的结果,否则一切都无从判断。 – 2015-02-09 00:25:20