2017-03-20 25 views
1

我正在使用多进程库:`from multiprocessing import Pool`。

Python:如何高效地运行多个 PhantomJS 实例?

虽然我使用的是 requests,但有些数据是在弹出窗口中加载的,我想用 Selenium 来获取这部分数据。在不引发内存泄漏的前提下,使用 PhantomJS 的最佳方式是什么?

+0

搭建一个 Selenium Grid,并把 'maxInstances' 设置为每个节点能够处理的数量,这样就可以根据需要添加节点?你需要多少个实例?每分钟有多少个请求?如果这不可行,或许可以考虑复用 Selenium 会话,在请求到来时轮流使用它们? – jmunsch

+0

@jmunsch 我不了解 'Selenium Grid'。我打算一次并行使用 5 个实例,每个请求会有 2-5 秒的延迟。 – Volatil3

+0

@jmunsch 其次,我需要一个基于服务器的解决方案,而这个 Grid 似乎需要安装 Java – Volatil3

回答

1

基本思路大致可以写成这样:

from __future__ import unicode_literals

import logging
from time import sleep

from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.proxy import Proxy, ProxyType
from werkzeug.exceptions import HTTPException
from werkzeug.routing import Map, Rule
from werkzeug.wrappers import Request, Response
class WebApp(object):
    """Minimal WSGI application base class built on werkzeug routing.

    Subclasses override ``url_map`` with their own rules and provide one
    ``endpoint_<name>`` method per endpoint name used in those rules.
    """

    # Default (empty) routing table; subclasses replace it.
    url_map = Map([])

    def __init__(self, **kw):
        """Create the application; keyword arguments are accepted but unused."""
        self.log = logging.getLogger(__name__)

    def __call__(self, environ, start_response):
        """WSGI entry point; delegates to :meth:`wsgi_app`."""
        return self.wsgi_app(environ, start_response)

    def wsgi_app(self, environ, start_response):
        """Wrap the WSGI environ in a Request, dispatch it and send the result."""
        request = Request(environ)
        response = self.dispatch_request(request)
        return response(environ, start_response)

    def dispatch_request(self, request):
        """Match the request URL and call the matching ``endpoint_<name>`` method.

        Returns whatever the endpoint method returns. If matching or the
        endpoint raises an ``HTTPException``, the exception object itself is
        returned so that ``wsgi_app`` can call it as the response.
        """
        adapter = self.url_map.bind_to_environ(request.environ)
        try:
            endpoint, values = adapter.match()
            method = getattr(self, 'endpoint_{}'.format(endpoint))
            return method(adapter, request, **values)
        # "except X, e" is Python 2-only syntax; "as" works on 2.6+ and 3.x.
        except HTTPException as e:
            return e


from pyvirtualdisplay import Display 
from selenium import webdriver 
from selenium.webdriver.common.keys import Keys 
from subprocess import Popen, PIPE 
import multiprocessing 
# Start an invisible (visible=0) 800x600 virtual display at import time,
# before any browser instance below is created.
display = Display(size=(800, 600), visible=0)
display.start()

def get_proxy_obj(proxy='123.456.789.012'):
    """Build Firefox desired capabilities that send traffic through a proxy.

    Args:
        proxy: Address used for the HTTP, FTP and SSL proxy settings. The
            default is the placeholder value from the original example (it is
            not a valid IPv4 address) and is expected to be overridden.

    Returns:
        A fresh capabilities dict with ``acceptSslCerts`` enabled and the
        manual proxy settings added. Despite the function's name, the return
        value is the capabilities dict, not the ``Proxy`` object.
    """
    proxyobj = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': proxy,
        'ftpProxy': proxy,
        'sslProxy': proxy,
        'noProxy': ''  # set this value as desired
    })
    # DesiredCapabilities.FIREFOX is a dict shared at class level; work on a
    # copy so the settings added here do not leak into every other caller.
    capabilities = DesiredCapabilities.FIREFOX.copy()
    capabilities['acceptSslCerts'] = True
    proxyobj.add_to_capabilities(capabilities)
    return capabilities





# Pool of pre-started Firefox drivers shared by the request handlers below.
# All drivers use the same profile directory and get their own proxied
# capabilities dict.
FIREFOX_PROFILE_DIR = '/etc/firefox/u2vgyy61.Proxied_User/'
DRIVER_POOL_SIZE = 3

drivers = [
    # Only the `webdriver` module is imported, so the classes are reached
    # through it; the capabilities come from get_proxy_obj(), the helper that
    # is actually defined in this file.
    webdriver.Firefox(
        webdriver.FirefoxProfile(FIREFOX_PROFILE_DIR),
        capabilities=get_proxy_obj(),
    )
    for _ in range(DRIVER_POOL_SIZE)
]

class Routes(WebApp):
    """WSGI app exposing ``GET /get_response`` backed by the ``drivers`` pool."""

    def endpoint_get_response(self, adapter, request, **values):
        """Load the URL given in ``query_param_here`` and return the page source.

        Args:
            adapter: The bound URL adapter passed by ``dispatch_request`` (unused).
            request: The incoming werkzeug request.
            **values: URL rule values (none are defined for this route).

        Returns:
            A ``Response`` with the rendered page source, a 400 response when
            the query parameter is missing or empty, or a 502 response when
            the browser fails to load the page.
        """
        url = request.values.get("query_param_here", "")
        if not url:
            return Response("Not", status=400)

        # Wait until a driver is free. Only an empty pool is retried; any
        # other error must surface instead of spinning forever.
        while True:
            try:
                driver = drivers.pop()
            except IndexError:
                sleep(1)
                continue
            break

        try:
            # driver.get() returns None; the loaded document is read from
            # page_source afterwards.
            driver.get(url)
            return Response(driver.page_source)
        except WebDriverException:
            self.log.exception("Failed to load %s", url)
            return Response("Failed to load page", status=502)
        finally:
            # Always hand the driver back, even when loading failed.
            drivers.append(driver)

    url_map = Map([
        Rule('/get_response', endpoint='get_response', methods=['GET']),
    ])

用法示例:

curl http://node1/get_response?query_param_here=http://stackoverflow.com 
curl http://node2/get_response?query_param_here=http://stackoverflow.com 
curl http://node3/get_response?query_param_here=http://stackoverflow.com 
curl http://node4/get_response?query_param_here=http://stackoverflow.com 
... 
and so on 

前面再加一个负载均衡器,例如: