如何使用 phantom-crawler 将页面内容打印到控制台

我刚刚为 Node.js 下载并安装了 phantom-crawler，并把下面的脚本复制粘贴到了一个名为 crawler.js 的文件中：

var Crawler = require('phantom-crawler');

// Can be initialized with optional options object
var crawler = new Crawler();
// queue is an array of URLs to be crawled
crawler.queue.push('https://google.com/');
// Can also do `crawler.fetch(url)` instead of pushing it and crawling it

// Extract plainText out of each phantomjs page
Promise.all(crawler.crawl())
    .then(function (pages) {
        var texts = pages.map(function (page) {
            // suffix Promise to return promises instead of callbacks
            var text = page.getPromise('plainText');
            // Pages are like tabs, they should be closed. Close on both
            // outcomes so a failed read does not leave the page open.
            var closePage = function () {
                page.close();
            };
            text.then(closePage, closePage);
            return text;
        });
        return Promise.all(texts);
    })
    .then(function (texts) {
        // texts = array of plaintext from the website bodies
        // also supports ajax requests
        console.log(texts);
    })
    .catch(function (err) {
        // Report the failure here so the cleanup step below still runs.
        console.error(err);
    })
    .then(function () {
        // kill that phantomjs bridge, whether or not the crawl succeeded
        return crawler.phantom.then(function (p) {
            p.exit();
        });
    });

我想打印完整的HTML源代码（在这种情况下，从谷歌网页）到控制台。

我搜索了很多，但我还没有找到类似的东西，所以我该怎么做？

来源

2016-03-27 George Vrynios

获取 `content` 而不是 `plainText` 的 Promise。

phantom-crawler 模块使用了 node-phantom-simple 模块，而后者又使用了 PhantomJS。

你可以在 PhantomJS wiki 中找到可以获取的属性列表。

var Crawler = require('phantom-crawler');

// Can be initialized with optional options object
var crawler = new Crawler();
// queue is an array of URLs to be crawled
crawler.queue.push('https://google.com/');
// Can also do `crawler.fetch(url)` instead of pushing it and crawling it

// Extract the `content` property (instead of `plainText`) out of each phantomjs page
Promise.all(crawler.crawl())
    .then(function (pages) {
        var allHtml = pages.map(function (page) {
            // suffix Promise to return promises instead of callbacks
            var html = page.getPromise('content');
            // Pages are like tabs, they should be closed. Close on both
            // outcomes so a failed read does not leave the page open.
            var closePage = function () {
                page.close();
            };
            html.then(closePage, closePage);
            return html;
        });
        return Promise.all(allHtml);
    })
    .then(function (allHtml) {
        // allHtml = array holding the `content` value of each crawled page
        // also supports ajax requests
        console.log(allHtml);
    })
    .catch(function (err) {
        // Report the failure here so the cleanup step below still runs.
        console.error(err);
    })
    .then(function () {
        // kill that phantomjs bridge, whether or not the crawl succeeded
        return crawler.phantom.then(function (p) {
            p.exit();
        });
    });

来源

2016-03-27 16:05:22 MasterT

感谢您的详细解答。这非常有帮助。 –

不客气，但你知道我有理由检查源代码;）！ – MasterT

我是 Node.js 技术的新手，正在努力理解各个部分是如何协同工作的。相信我，我查看过源代码，但我看不懂 CoffeeScript。 –

如何使用 phantom-crawler 将页面内容打印到控制台

回答

相关问题