
So I am not sure how to handle this situation. The script works for many other broken links, but not this one: the image download gets stuck on a link that does not exist.

import datetime 
import praw 
import re 
import urllib 
import requests 
from bs4 import BeautifulSoup 




sub = 'dog' 
imgurUrlPattern = re.compile(r'(http://i.imgur.com/(.*))(\?.*)?') 
r = praw.Reddit(user_agent = "download all images from a subreddit", 
     user_site = "lamiastella") 
already_done = [] 
#checkWords = ['i.imgur.com', 'jpg', 'png',] 
check_words = ['jpg', 'jpeg', 'png'] 

subreddit = r.get_subreddit(sub) 
for submission in subreddit.get_top_from_all(limit=10000): 
#for submission in subreddit.get_hot(limit=10000): 
    is_image = any(string in submission.url for string in check_words) 
    print '[LOG] Getting url: ' + submission.url 
    if submission.id not in already_done and is_image: 
        if submission.url.endswith('/'): 
            modified_url = submission.url[:-1]  # strip the trailing slash 
            try: 
                urllib.urlretrieve(modified_url, '/home/jalal/computer_vision/image_retrieval/images/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + modified_url[-5:]) 
            except Exception as e: 
                print(e) 
                continue 
        else: 
            try: 
                urllib.urlretrieve(submission.url, '/home/jalal/computer_vision/image_retrieval/images/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + submission.url[-5:]) 
            except Exception as e: 
                print(e) 
                continue 

        already_done.append(submission.id) 
        print '[LOG] Done Getting ' + submission.url 
        print('{0}: {1}'.format('submission id is', submission.id)) 
    elif 'imgur.com' in submission.url and not (submission.url.endswith('gif') 
                                                or submission.url.endswith('webm') 
                                                or submission.url.endswith('mp4') 
                                                or submission.url.endswith('all') 
                                                or '#' in submission.url 
                                                or '/a/' in submission.url): 
        # This is an Imgur page with a single image. 
        html_source = requests.get(submission.url).text  # download the image's page 
        soup = BeautifulSoup(html_source, "lxml") 
        image_url = soup.select('img')[0]['src'] 
        if image_url.startswith('//'): 
            # if no schema is supplied in the url, prepend 'http:' to it 
            image_url = 'http:' + image_url 
        image_id = image_url[image_url.rfind('/') + 1:image_url.rfind('.')] 
        # [-4:] keeps the file extension, e.g. '.jpg' (the original slice [-9:0] was always empty) 
        urllib.urlretrieve(image_url, '/home/jalal/computer_vision/image_retrieval/images/' + 'imgur_' + datetime.datetime.now().strftime('%y-%m-%d-%s') + image_url[-4:]) 
    elif 'instagram.com' in submission.url: 
        html_source = requests.get(submission.url).text 
        soup = BeautifulSoup(html_source, "lxml") 
        instagram_url = soup.find('meta', {"property": "og:image"})['content'] 
        urllib.urlretrieve(instagram_url, '/home/jalal/computer_vision/image_retrieval/images/' + 'instagram_' + datetime.datetime.now().strftime('%y-%m-%d-%s') + '.jpg') 
    else: 
        continue 
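
One workaround, as a minimal sketch not taken from the original post: urllib.urlretrieve in Python 2 accepts no timeout argument, but new sockets pick up the module-wide default timeout, so setting one once before the download loop makes a dead host raise an error instead of hanging. The 10-second value and the /tmp path below are arbitrary illustrative choices.

import socket 
import urllib 

# urlretrieve has no timeout parameter in Python 2, but sockets created by 
# httplib honor the global default timeout, so a stalled connect raises 
# instead of blocking forever. 
socket.setdefaulttimeout(10)  # seconds; an arbitrary illustrative value 

try: 
    urllib.urlretrieve('http://cutearoo.com/wp-content/uploads/2011/04/Pomsky.png', 
                       '/tmp/Pomsky.png') 
except IOError as e:  # urllib surfaces the socket timeout as an IOError 
    print(e) 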

I get stuck at the link http://cutearoo.com/wp-content/uploads/2011/04/Pomsky.png and had to Ctrl+C out of it:

[LOG] Done Getting http://i.imgur.com/Vc9P9QC.jpg 
submission id is: 1fv70j 
[LOG] Getting url: http://i.imgur.com/iOBi0qx.jpg 
[LOG] Done Getting http://i.imgur.com/iOBi0qx.jpg 
submission id is: 1dof3o 
[LOG] Getting url: http://cutearoo.com/wp-content/uploads/2011/04/Pomsky.png 
^CTraceback (most recent call last): 
    File "download_images.py", line 35, in <module> 
    urllib.urlretrieve(submission.url, '/home/jalal/computer_vision/image_retrieval/images/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + submission.url[-5:]) 
    File "/usr/lib/python2.7/urllib.py", line 98, in urlretrieve 
    return opener.retrieve(url, filename, reporthook, data) 
    File "/usr/lib/python2.7/urllib.py", line 245, in retrieve 
    fp = self.open(url, data) 
    File "/usr/lib/python2.7/urllib.py", line 213, in open 
    return getattr(self, name)(url) 
    File "/usr/lib/python2.7/urllib.py", line 350, in open_http 
    h.endheaders(data) 
    File "/usr/lib/python2.7/httplib.py", line 1053, in endheaders 
    self._send_output(message_body) 
    File "/usr/lib/python2.7/httplib.py", line 897, in _send_output 
    self.send(msg) 
    File "/usr/lib/python2.7/httplib.py", line 859, in send 
    self.connect() 
    File "/usr/lib/python2.7/httplib.py", line 836, in connect 
    self.timeout, self.source_address) 
    File "/usr/lib/python2.7/socket.py", line 566, in create_connection 
    sock.connect(sa) 
    File "/usr/lib/python2.7/socket.py", line 228, in meth 
    return getattr(self._sock,name)(*args) 
KeyboardInterrupt 

The traceback shows the process blocked inside sock.connect(): the dead host never completes the TCP connection, and urllib has no default timeout, so the download waits forever. Please suggest a fix for this.

Update: I am using something like this:

image_file = urllib2.urlopen(modified_url) 
with open('/home/jalal/computer_vision/image_retrieval/images/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + modified_url[-5:], 'wb') as output_image: 
    output_image.write(image_file.read()) 

and it still gets stuck on that particular link.


Possible duplicate of [Timeout a file download with Python urllib?](http://stackoverflow.com/questions/32763720/timeout-a-file-download-with-python-urllib) – zvone


@zvone Please check the update! –

Answer


Use the timeout argument of urlopen:

>>> import urllib2 
>>> modified_url = 'http://cutearoo.com/wp-content/uploads/2011/04/Pomsky.png' 
>>> try: 
...  image_file = urllib2.urlopen(modified_url, timeout=5) 
... except urllib2.URLError: 
...  print 'could not download :(' 
... 
could not download :(
>>> 

The answer above is correct :) Just adding what I did based on your answer as well:

try: 
    # timeout belongs on urlopen, not on write(): it bounds the blocking 
    # socket operations (connect and reads) 
    image_file = urllib2.urlopen(modified_url, timeout=5) 
    with open('/home/jalal/computer_vision/image_retrieval/' + category + '/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + modified_url[-5:], 'wb') as output_image: 
        output_image.write(image_file.read()) 
except urllib2.URLError as e: 
    print(e) 
    continue 
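
For comparison, here is a sketch of the same download using requests, which the script above already imports; modified_url and out_path are illustrative placeholders, not names from the original code. Once a timeout is set, requests raises an exception on a stalled connection or read, and raise_for_status() turns a 404 into an error instead of silently saving an HTML error page:

import datetime 
import requests 

modified_url = 'http://cutearoo.com/wp-content/uploads/2011/04/Pomsky.png' 
out_path = '/tmp/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + modified_url[-5:] 

try: 
    response = requests.get(modified_url, timeout=5)  # bounds connect and each read 
    response.raise_for_status()  # surface HTTP errors such as 404 
    with open(out_path, 'wb') as output_image: 
        output_image.write(response.content) 
except requests.exceptions.RequestException as e:  # covers Timeout, ConnectionError, HTTPError 
    print(e) 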