2016-03-27 31 views
1

我刚刚为 Node.js 下载并安装了 phantom-crawler,并把下面的脚本复制粘贴到了一个名为 crawler.js 的文件中(问题标题:如何使用 phantom-crawler 将页面内容打印到控制台):

var Crawler = require('phantom-crawler');

// The constructor also accepts an optional options object
var crawler = new Crawler();

// Every URL placed in `queue` gets crawled
// (`crawler.fetch(url)` is the alternative to queueing and then crawling)
crawler.queue.push('https://google.com/');

// Ask every crawled phantomjs page for its plainText and wait for all of them
function collectPlainText(pages) {
    var pending = pages.map(function (tab) {
        // the `Promise` suffix gives back a promise instead of taking a callback
        var plain = tab.getPromise('plainText');
        // Pages are like tabs: close each one once its text has arrived
        plain.then(function () {
            tab.close();
        });
        return plain;
    });
    return Promise.all(pending);
}

// `bodies` is the array of plaintext taken from the website bodies
// (also supports ajax requests)
function printTexts(bodies) {
    console.log(bodies);
}

// Kill the phantomjs bridge once everything has been printed
function stopPhantom() {
    crawler.phantom.then(function (bridge) {
        bridge.exit();
    });
}

Promise.all(crawler.crawl())
    .then(collectPlainText)
    .then(printTexts)
    .then(stopPhantom);

我想打印完整的HTML源代码(在这种情况下,从谷歌网页)到控制台。

我搜索了很多,但我还没有找到类似的东西,所以我该怎么做?

回答

1

获取 `content` 而不是 `plainText` 的 Promise。

模块 phantom-crawler 使用了模块 node-phantom-simple,而后者又使用了 PhantomJS。

你可以在 PhantomJS 的 wiki 中找到可以读取的属性列表。

var Crawler = require('phantom-crawler');

// Can be initialized with optional options object
var crawler = new Crawler();
// queue is an array of URLs to be crawled
crawler.queue.push('https://google.com/');

/**
 * Shuts down the phantomjs bridge held by the crawler.
 *
 * @returns {Promise} settles after `exit()` has been called on the bridge,
 *     so callers can chain on it instead of leaving it floating.
 */
function exitPhantom() {
    return crawler.phantom.then(function (p) {
        p.exit();
    });
}

/**
 * Reports a failure and marks the process as failed without killing it
 * abruptly, so pending cleanup can still run.
 *
 * @param {*} err - the rejection reason.
 */
function reportError(err) {
    console.error(err);
    process.exitCode = 1;
}

// Can also do `crawler.fetch(url)` instead of pushing it and crawling it
// Extract the full HTML source (`content`) out of each phantomjs page
Promise.all(crawler.crawl())
.then(function (pages) {
    var allHtml = pages.map(function (page) {
        // suffix Promise to return promises instead of callbacks
        var html = page.getPromise('content');
        // Pages are like tabs, they should be closed -- also when reading
        // the content failed, otherwise the page would be leaked.
        var closePage = function () {
            page.close();
        };
        html.then(closePage, closePage);
        return html;
    });
    return Promise.all(allHtml);
})
.then(function (allHtml) {
    // allHtml = array with the HTML source of each crawled page
    console.log(allHtml);
})
.then(exitPhantom, function (err) {
    // Crawling or reading failed: still kill the phantomjs bridge,
    // otherwise the child process keeps the script alive.
    reportError(err);
    return exitPhantom();
})
.catch(reportError); // shutting down the bridge itself failed
+0

感谢您的详细解答。这非常有帮助。 –

+0

不客气,但你知道我有理由检查源代码;)! – MasterT

+0

我是 Node.js 技术的新手,正在努力弄清楚所有东西是如何协同工作的。相信我,我看过源代码,但我看不懂 CoffeeScript。 –