以下脚本的 links 数组中包含一些 URL。函数 gatherLinks() 用于从 links 数组中各个 URL 的 sitemap.xml 里收集更多 URL。一旦 links 数组中的 URL 数量足够(由变量 limit 决定),就会调用函数 request():它对 links 数组中的每个 URL 向服务器发送请求,获取响应,并使用 page.render() 函数将页面保存为图像。

PhantomJS 2.0.0 不会等待页面加载
问题是:当我使用 PhantomJS 2.0.0 运行该脚本时,许多图像缺少大量内容,也就是说 PhantomJS 可能没有等到所有内容加载完成就进行了渲染。但是当我使用 PhantomJS 1.9.8 时,所有内容都能正常加载。可能是什么原因?
var webpage = require('webpage');
var system = require('system');
var fs = require('fs');
// Seed URLs. gatherLinks() appends the URLs it finds in each seed's
// sitemap.xml to this same array, and request() later renders every entry.
var links = [
    "http://somesite.com",
    "http://someothersite.com",
    // ... further seed URLs were elided in the original listing
];

// Cursor into `links`; shared by gatherLinks() (sitemap being fetched) and
// request() (page being rendered).
var index = 0;
// Number of page loads (sitemap or page) that did not report "success".
var fail = 0;
// gatherLinks() stops collecting once `links` holds this many URLs.
var limit = 20;
// Script start timestamp (ms); request() uses it to report total run time.
var finalTime = Date.now();
/**
 * Fetches `<link>/sitemap.xml`, appends the text of every <loc> element to the
 * shared `links` array (up to `limit` entries), then either moves on to the
 * next entry in `links` or hands over to request().
 *
 * Reads and writes the shared `links`, `index` and `fail` variables and reads
 * `limit`. A sitemap that fails to load is counted in `fail` and skipped, so
 * one bad sitemap no longer stalls the whole run.
 *
 * @param {string} link Base URL whose sitemap.xml should be fetched.
 */
var gatherLinks = function(link){
    var page = webpage.create();
    var sitemapUrl = link + "/sitemap.xml";
    console.log("Fetching links from " + sitemapUrl);
    page.open(sitemapUrl, function(status){
        if(status !== "success"){
            console.log("Sitemap Request FAILED, status: " + status);
            fail++;
        } else {
            var xmlDoc = new DOMParser().parseFromString(page.content, 'text/xml');
            var loc = xmlDoc.getElementsByTagName('loc');
            // Stop copying as soon as the limit is reached.
            for(var i = 0; i < loc.length && links.length < limit; i++){
                links.push(loc[i].textContent);
            }
        }
        // The page is needed no longer on any path; close it exactly once.
        page.close();

        if(links.length >= limit){
            console.log(links.length + " Links prepared. Starting requests.\n");
            index = 0;
            request();
            return;
        }
        // No further entry to take a sitemap from: start rendering with what
        // we have. Checking index + 1 avoids calling gatherLinks(undefined).
        if(index + 1 >= links.length){
            console.log(links.length + " Links prepared\n\n");
            index = 0;
            request();
            return;
        }
        gatherLinks(links[++index]);
    });
};
/**
 * Opens links[index] in a fresh page, renders it to "img_200_<index>.jpeg",
 * then advances `index` and repeats. After the last link it prints a summary
 * and exits PhantomJS.
 *
 * Reads the shared `links` and `finalTime` variables and updates `index` and
 * `fail`. A page is still rendered when its load did not report "success".
 */
var request = function(){
    var startTime = Date.now();
    var page = webpage.create();
    page.open(links[index], function(status) {
        console.log('Loading link #' + (index + 1) + ': ' + links[index]);
        console.log("Time taken: " + (Date.now() - startTime) + " msecs");
        if(status !== "success"){
            console.log("Request FAILED, status: " + status);
            fail++;
        }
        // NOTE(review): rendering happens as soon as the open callback fires.
        // If images come out incomplete, deferring this call (e.g. with
        // setTimeout) may be needed - confirm against the target pages.
        page.render("img_200_" + index + ".jpeg", {format: 'jpeg', quality: '100'});
        page.close();

        if(index >= links.length-1){
            console.log("\n\nAll links done, final time taken: " + (Date.now() - finalTime) + " msecs");
            console.log("Requests sent: " + links.length + ", Failures: " + fail);
            console.log("Success ratio: " + ((links.length - fail)/links.length)*100 + "%");
            phantom.exit();
            // Execution continues after phantom.exit() returns, so stop here
            // explicitly instead of falling through to another request().
            return;
        }
        index++;
        request();
    });
};
// Entry point: start collecting sitemap links from the first seed URL.
gatherLinks(links[0]);
我在让 PhantomJS 和 CasperJS 等待整个页面加载完成这件事上也遇到了很多麻烦。我尝试过按照这个建议来做:http://stackoverflow.com/a/27472788/470749 – Ryan