
This script takes 2 seconds to complete. How can I run it with multithreading or multiprocessing so that it finishes in around 50 milliseconds?

import urllib2
from threading import Thread

def btl_test(url):
    # Fetch the url and print the response object
    page = urllib2.urlopen(url)
    print page

url = ["http://google.com","http://example.com","http://yahoo.com","http://linkedin.com","http://orkut.com","http://quora.com","http://facebook.com","http://myspace.com","http://gmail.com","http://nltk.org","http://cyber.com"]

for i in url:
    # Spawn one thread per url; the fetches run concurrently
    t = Thread(target=btl_test, args=(i,))
    t.start()

And once they finish, how do I put the results back in order?

Answers

from contextlib import closing # http://stackoverflow.com/a/25968716/968442 
from multiprocessing.pool import Pool 

with closing(Pool(len(url))) as pool: 
    pool.map(btl_test, url) 

This should be a handy snippet. As for the order, you can map to tuples and print them accordingly.


Update: as per this blog, pool.map returns its output with the input order preserved. Below is a version that returns a list of tuples in (url, html_content) format without changing the order:

urls = ["http://google.com","http://example.com","http://yahoo.com","http://linkedin.com","http://orkut.com","http://quora.com","http://facebook.com","http://myspace.com","http://gmail.com","http://nltk.org","http://cyber.com"] 

def btl_test(url): 
    import urllib2 
    return url, urllib2.urlopen(url).read() 

from contextlib import closing # http://stackoverflow.com/a/25968716/968442 
from multiprocessing.pool import Pool 

with closing(Pool(len(urls))) as pool: 
    result = pool.map(btl_test, urls) 

print result 
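A minimal usage sketch (not part of the original answer, assuming the result list above): because pool.map preserves input order, iterating over result walks the pages in the same order as urls:

for page_url, html in result:
    # Results come back in the same order the urls were submitted
    print page_url, len(html)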

Could you explain how this map keeps the order? I'm not able to understand it. – Mounarajan


Updated the answer; it should help. – nehemiah


Try using Queue() together with enumerate to store the order.

import threading
import requests
import Queue

class UrlReader(threading.Thread):
    """Worker thread: pulls (index, url) pairs from the input queue,
    fetches each url, and puts (index, resolved_url) on the output queue."""

    def __init__(self, queue, output):
        super(UrlReader, self).__init__()
        self.daemon = True  # was `self.setDaemon = True`, which only shadowed the method
        self.queue = queue
        self.output = output

    def run(self):
        while True:
            try:
                target = self.queue.get(block=False)
                data = requests.get(target[1])
                print data.status_code
                if data.status_code == 200:
                    self.queue.task_done()
                    # Keep the enumerate index with the url so the
                    # original order can be recovered later
                    self.output.put((target[0], data.url), block=False)
                else:
                    # Non-200 response: put the item back for a retry
                    self.queue.task_done()
                    self.queue.put(target)
            except Queue.Empty:
                break  # input queue is drained, the worker exits
            except requests.exceptions.ConnectionError:
                self.queue.task_done()
                self.queue.put(target)


def load(urlrange, num_threads):
    mainqueue = Queue.Queue()  # (index, url) work items
    outq = Queue.Queue()       # (index, resolved_url) results
    mythreads = []

    for url in urlrange:
        mainqueue.put(url)

    for j in xrange(num_threads):
        mythreads.append(UrlReader(mainqueue, outq))
        mythreads[-1].start()

    mainqueue.join()  # block until every work item is marked task_done()
    for t in mythreads:
        t.join()
    return list(outq.queue)  # snapshot of the results queue, unordered

urls = ["http://google.com","http://example.com","http://yahoo.com","http://linkedin.com","http://orkut.com","http://quora.com","http://facebook.com","http://myspace.com","http://gmail.com","http://nltk.org","http://cyber.com"] 

print load(enumerate(urls), 10) 

>>> [(6, 'http://facebook.com'), (9, 'http://nltk.org'), (0, 'http://google.com'), (1, 'http://example.com'), (2, 'http://yahoo.com'), (3, 'http://linkedin.com'), (4, 'http://orkut.com'), (5, 'http://quora.com'), (7, 'http://myspace.com'), (8, 'http://gmail.com'), (10, 'http://cyber.com')] 
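A follow-up sketch (an assumption, not from the original answer): since each result tuple carries the enumerate index, sorting on that index restores the original url order once the unordered fetches complete:

result = load(enumerate(urls), 10)
# Sort on the stored enumerate index to recover submission order
for index, fetched_url in sorted(result, key=lambda pair: pair[0]):
    print index, fetched_url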

Hi, can you help me understand this script by adding documentation (comments), please? – Mounarajan


This works:

from multiprocessing.pool import Pool
import re
import urllib2

def btl_test(url):
    page = urllib2.urlopen(url).read()
    # Print the contents of the <title> tag, if the page has one
    titles = re.findall(r'<title>(.*?)</title>', page)
    if titles:
        print titles[0]

url = ["http://google.com","http://example.com","http://yahoo.com","http://linkedin.com","http://facebook.com","http://orkut.com","http://oosing.com","http://pinterets.com","http://orkut.com","http://quora.com","http://facebook.com","http://myspace.com","http://gmail.com","http://nltk.org","http://cyber.com"]

#for i in url:
#    print btl_test(i)
nprocs = 2  # nprocs is the number of processes to run
ParsePool = Pool(nprocs)
ParsePool.map(btl_test, url)
#ParsedURLS = ParsePool.map(btl_test, url)
#print ParsedURLS
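A hedged variation (btl_titles is a hypothetical helper, not part of the original answer): because btl_test prints from inside the worker processes, output order is nondeterministic; returning the title instead and printing the mapped list keeps results in the same order as url:

def btl_titles(u):
    # Variant of btl_test that returns the title instead of printing it
    page = urllib2.urlopen(u).read()
    titles = re.findall(r'<title>(.*?)</title>', page)
    return titles[0] if titles else None

# pool.map preserves input order, so titles print in the order of url
for title in ParsePool.map(btl_titles, url):
    print title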

This helps a lot.
