2016-04-15 26 views
2

首先,我已成功安装 PhantomJS 及其 npm 接口 phantom,并且已经用新语法写好了加载页面的代码(这里发布的其他问题都基于旧的代码语法,也可能是我遗漏了什么)。这是我想要抓取的页面(this is the source I'm trying to scrape)。 使用 Node.js 和 PhantomJS 抓取动态内容

现在的问题是:右侧边栏中“公爵”附近的仿下拉选择框(fake select),以及右侧边栏中的另一个选择框,都是动态生成的,我不明白为什么 PhantomJS 没有抓取到它们。我的代码如下:

// Opens `sito` in PhantomJS (through the `phantom` npm bridge), prints the
// load status and the page's HTML content, then shuts PhantomJS down.
const sito = "http://bicincitta.tobike.it/";
const phantom = require('phantom');

// Kept at module scope so that later steps of the promise chain, and the
// error handler, can reach the objects created by earlier steps.
let sitepage = null;
let phInstance = null;

phantom.create()
    .then((instance) => {
        phInstance = instance;
        return instance.createPage();
    })
    .then((page) => {
        sitepage = page;
        return page.open(sito);
    })
    .then((status) => {
        console.log(status);
        return sitepage.property('content');
    })
    .then((content) => {
        console.log(content);
        // Let the page finish closing before the PhantomJS process is stopped.
        return sitepage.close();
    })
    .then(() => phInstance.exit())
    .catch((error) => {
        console.log(error);
        // phInstance is still null when phantom.create() itself rejected;
        // calling exit() on it would throw inside this handler.
        if (phInstance !== null) {
            phInstance.exit();
        }
    });

我现在真是束手无策了。我是否应该设法获取该网站的脚本并执行它们?还是我漏掉了某条指令?

另外顺便问一句:既然 page 只在第二个“.then”的作用域内可用,我不太清楚该如何在它上面继续链式调用其他方法。

回答

1

上周我一直在使用 PhantomJS,试图让它为那些由 Angular 渲染数据的页面生成快照。我发现最简单的方法是:对本地脚本使用 page.injectJs('../script.js'),对外部脚本使用 page.includeJs('http://jquery.com...')。由于 Phantom 运行在沙盒中,除非您明确提供要执行的 JS,否则它不会在所捕获的页面上执行 JavaScript。这样您就可以对使用 JavaScript 渲染数据的页面进行截图。

1

在 HTML 的底部有一个 CDATA 脚本,PhantomJS 无法解析它。那些项目正是在这里被填充的。

<script type="text/javascript"> 
//<![CDATA[ 
// Excerpt of the page's inline start-up script. It calls
// Sys.Application.initialize() and then registers six init callbacks via
// Sys.Application.add_init(). Each callback calls $create() for one
// Telerik.Web.UI component (RadAjaxManager, three RadAjaxPanel instances,
// RadWindow, RadWindowManager), passing a settings object and the element
// returned by $get("<id>").
// NOTE(review): Sys, $create, $get and Telerik are defined outside this
// excerpt — presumably by the ASP.NET AJAX / Telerik client scripts; confirm.
Sys.Application.initialize(); 
Sys.Application.add_init(function() { 
    $create(Telerik.Web.UI.RadAjaxManager, {"_updatePanels":"","ajaxSettings":[],"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"defaultLoadingPanelID":"","enableAJAX":true,"enableHistory":false,"links":[],"styles":[],"uniqueID":"RadAjaxManager1","updatePanelsRenderMode":0}, null, null, $get("RadAjaxManager1")); 
}); 
Sys.Application.add_init(function() { 
    $create(Telerik.Web.UI.RadAjaxPanel, {"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"enableAJAX":true,"enableHistory":false,"links":[],"loadingPanelID":"","styles":[],"uniqueID":"ajCheckLoginUser"}, null, null, $get("ajCheckLoginUser")); 
}); 
Sys.Application.add_init(function() { 
    $create(Telerik.Web.UI.RadAjaxPanel, {"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"enableAJAX":true,"enableHistory":false,"links":[],"loadingPanelID":"","styles":[],"uniqueID":"ajCheckLoginAdmin"}, null, null, $get("ajCheckLoginAdmin")); 
}); 
Sys.Application.add_init(function() { 
    $create(Telerik.Web.UI.RadAjaxPanel, {"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"enableAJAX":true,"enableHistory":false,"links":[],"loadingPanelID":"","styles":[],"uniqueID":"ajLogoutUser"}, null, null, $get("ajLogoutUser")); 
}); 
// The RadWindow below is the only component given an event handler here:
// "close" is wired to OnClientClosePortal (defined outside this excerpt).
Sys.Application.add_init(function() { 
    $create(Telerik.Web.UI.RadWindow, {"_dockMode":false,"behaviors":0,"clientStateFieldID":"radPortal_ClientState","destroyOnClose":true,"formID":"form1","height":"180px","iconUrl":"","left":"","minimizeIconUrl":"","modal":true,"name":"radPortal","reloadOnShow":true,"showContentDuringLoad":false,"skin":"Office2007","top":"","visibleStatusbar":false,"width":"450px"}, {"close":OnClientClosePortal}, null, $get("radPortal")); 
}); 
// The window manager lists 'radPortal' in windowControls and references it
// again as {"child":"radPortal"}.
Sys.Application.add_init(function() { 
    $create(Telerik.Web.UI.RadWindowManager, {"behaviors":4,"clientStateFieldID":"windowManagerPortal_ClientState","destroyOnClose":true,"formID":"form1","iconUrl":"","left":"","minimizeIconUrl":"","modal":true,"name":"windowManagerPortal","reloadOnShow":true,"showContentDuringLoad":false,"skin":"Office2007","top":"","visibleStatusbar":false,"windowControls":"['radPortal']"}, null, {"child":"radPortal"}, $get("windowManagerPortal")); 
    }); 
//]]> 
</script> 

一旦断开与该站点服务器的通信,这些项目也会随之被销毁。这个问题有办法解决,但我认为您最好还是换一种方式。我是用 npm 的 cheerio 包来加载 CDATA 中的 HTML 的。