2012-10-12 30 views
2

我试图用QtWebKit解析由JS生成的网页,并找到了一个如何获取网页源码的示例。问题是:如何在Python线程中使用QtWebKit?

import sys 
from PySide.QtGui import * 
from PySide.QtCore import * 
from PySide.QtWebKit import * 
class Render(QWebPage):
    """Load a URL in a QWebPage and block until loading has finished.

    Constructing an instance runs the Qt event loop until the page's
    ``loadFinished`` signal fires.  Afterwards ``self.frame`` holds the
    page's main frame (so the markup is available via
    ``self.frame.toHtml()``) and ``self.load_ok`` holds the flag that the
    ``loadFinished`` signal reported.
    """

    def __init__(self, url):
        # Only one QApplication may exist per process, and it must exist
        # before the QWebPage is constructed.  Reuse a running instance so
        # that Render can be instantiated more than once.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Run the event loop until _loadFinished() stops it.
        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: expose the main frame and stop the loop.

        ``result`` is the flag passed by the signal; it is stored as
        ``self.load_ok`` so callers can check it after construction.
        """
        self.load_ok = result
        self.frame = self.mainFrame()
        self.app.quit()
# Render.__init__ runs the Qt event loop itself, so this call only returns
# after _loadFinished() has asked the application to quit.
url = 'http://www.thesite.gov/search'
r = Render(url)
# HTML of the page's main frame, as returned by toHtml().
html = r.frame.toHtml()

但我不知道如何让它在线程中工作。那么,该怎么做呢?如果这不可能——是否有另一种快速获取由JS生成的网页的方法?

回答

3

鉴于Qt的异步特性,QtWebKit的方法也是非阻塞的,因此没有必要在线程中运行它们。你可以像这样并行启动它们:

from functools import partial 

from PySide.QtCore import QUrl 
from PySide.QtGui import QApplication 
from PySide.QtWebKit import QWebView, QWebSettings 


# URLs handed to Crawler.start() by the __main__ block below.
TARGET_URLS = (
    'http://stackoverflow.com',
    'http://github.com',
    'http://bitbucket.org',
    'http://news.ycombinator.com',
    'http://slashdot.org',
    'http://www.reddit.com',
    'http://www.dzone.com',
    'http://www.ideone.com',
    'http://jsfiddle.net',
)


class Crawler(object):
    """Load several URLs in parallel QWebViews and collect their HTML.

    ``start()`` creates one web view per URL; each view reports back through
    ``_load_finished()``.  Once every registered view has reported,
    ``app.quit()`` is called so a surrounding ``app.exec_()`` returns.

    Attributes:
        app: the application object whose ``quit()`` is called at the end.
        results: maps each finished frame's ``url()`` to its ``toHtml()``.
        browsers: maps browser_id to a ``(web_view, finished)`` tuple.
    """

    def __init__(self, app):
        self.app = app
        self.results = dict()
        self.browsers = dict()

    def _load_finished(self, browser_id, ok):
        """Handle ``loadFinished`` for the view registered as *browser_id*.

        *ok* is the flag delivered by the signal.  The frame's HTML is stored
        regardless of its value.
        """
        # A single pre-formatted argument prints identically under the
        # Python 2 print statement and the Python 3 print() function.
        print('%s %s' % (ok, browser_id))
        web_view, _flag = self.browsers[browser_id]
        self.browsers[browser_id] = (web_view, True)

        frame = web_view.page().mainFrame()
        self.results[frame.url()] = frame.toHtml()

        # Detach from the signal and stop the view so it does not report
        # again after its result has been recorded.
        web_view.loadFinished.disconnect()
        web_view.stop()

        if all(finished for _view, finished in self.browsers.values()):
            print('all finished')
            self.app.quit()

    def start(self, urls):
        """Create a web view for every URL in *urls* and start loading it.

        Note: if *urls* is empty, no view is created and nothing in this
        class will ever call ``app.quit()``.
        """
        for browser_id, url in enumerate(urls):
            web_view = QWebView()
            web_view.settings().setAttribute(QWebSettings.AutoLoadImages,
                                             False)
            # Register the view before connecting and loading, so that
            # _load_finished() can always find its entry in self.browsers.
            self.browsers[browser_id] = (web_view, False)
            loaded = partial(self._load_finished, browser_id)
            web_view.loadFinished.connect(loaded)
            web_view.load(QUrl(url))


if __name__ == '__main__':
    app = QApplication([])
    crawler = Crawler(app)
    crawler.start(TARGET_URLS)
    # Blocks until the crawler calls app.quit() after the last page reports.
    app.exec_()
    # A single pre-formatted argument keeps the Python 2 output unchanged
    # and is also valid under Python 3; list() makes the keys print as a
    # list on both.
    print('got: %s' % (list(crawler.results.keys()),))
+0

如何用PySide或PyQt解析网页(html + js)?这段代码似乎无法解析JS – user1179442

+1

@user1179442:可以通过以下设置实现:`QWebView().settings().setAttribute(QWebSettings.JavascriptEnabled, False)` – andrean

+0

感谢您的信息。但第二个参数应该是 `True` :) – user1179442