2016-07-14 36 views
0

受到this question的接受答案的启发，我试图用类似requests的接口来包装PyCurl。一切都很顺利，但在按照PyCURL文档的说明从响应头中读取正文编码之后，我遇到了以下问题：每个响应头都会触发一次头部回调，但这些回调只在迭代器开始产生响应行之后才被调用，这使得编码/字符集检测变得毫无意义。也就是说，PyCURL似乎在处理响应头之前就先处理了正文。

下面的代码:

import re 
import io 
import urllib 
import urllib.error 
import http 

import pycurl 


class CurlHTTPStream(object):
    """Streaming HTTP response read through pycurl's multi interface.

    A single ``pycurl.Curl`` handle is configured for the request and
    attached to a ``pycurl.CurlMulti`` handle.  Body bytes are collected
    by the write callback into an in-memory buffer and handed out line by
    line through :meth:`iter_lines`; response headers are collected by
    :meth:`header_function` into ``self.headers`` (lower-cased names).

    The ``data`` constructor argument is accepted for interface
    compatibility but is not used by this class.
    """

    # Timeout passed to CurlMulti.select() while waiting for activity.
    SELECT_TIMEOUT = 10
    # Encoding used for header lines and as the body fallback charset.
    HTTP_STANDARD_ENCODING = 'iso-8859-1'
    # Charset parameter of a (lower-cased) Content-Type value.  Optional
    # quotes are skipped and the value stops at whitespace, ';' or a quote,
    # so 'charset="utf-8"; x=y' yields 'utf-8' rather than '"utf-8";'.
    _CHARSET_RE = re.compile(r'charset=["\']?([^\s;"\']+)')

    def __init__(self, method, url, data=None, params=None, headers=None):
        """Configure the curl handles; no network I/O happens here.

        :param method: HTTP verb, set through ``CUSTOMREQUEST``.
        :param url: target URL without the extra query parameters.
        :param data: unused.
        :param params: optional mapping appended to ``url`` as a
            URL-encoded query string.
        :param headers: optional mapping of request headers.
        """
        self.url = url
        self.received_buffer = io.BytesIO()

        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.CUSTOMREQUEST, method)
        if headers:
            self.curl.setopt(
                pycurl.HTTPHEADER,
                [
                    '{}: {}'.format(key, value)
                    for key, value in headers.items()
                ],
            )
        self.curl.setopt(pycurl.URL, self._build_url(url, params))
        self.curl.setopt(pycurl.ENCODING, 'gzip')
        self.curl.setopt(pycurl.CONNECTTIMEOUT, 5)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.curl.setopt(pycurl.WRITEFUNCTION, self.received_buffer.write)

        self.curl_multi = pycurl.CurlMulti()
        self.curl_multi.add_handle(self.curl)

        self.status_code = 0
        self.headers = {}

    @staticmethod
    def _build_url(url, params):
        """Return ``url`` with ``params`` appended as an encoded query string.

        Keys and values are percent-encoded, and ``&`` is used as the
        separator when ``url`` already carries a query string.
        """
        if not params:
            return url
        # Imported locally: the module only imports urllib and urllib.error.
        from urllib.parse import urlencode

        if url.endswith(('?', '&')):
            separator = ''
        elif '?' in url:
            separator = '&'
        else:
            separator = '?'
        return '{}{}{}'.format(url, separator, urlencode(params))

    def _any_data_received(self):
        """Return True when the write callback has buffered body bytes."""
        return self.received_buffer.tell() != 0

    def _get_received_data(self):
        """Return the buffered body bytes and reset the buffer to empty."""
        result = self.received_buffer.getvalue()
        self.received_buffer.truncate(0)
        self.received_buffer.seek(0)
        return result

    def _check_status_code(self):
        """Raise ``urllib.error.HTTPError`` for a known non-200 status.

        The status is read from curl until it becomes non-zero and is
        cached in ``self.status_code`` afterwards.
        """
        if self.status_code == 0:
            self.status_code = self.curl.getinfo(pycurl.HTTP_CODE)
        if self.status_code != 0 and self.status_code != http.HTTPStatus.OK:
            raise urllib.error.HTTPError(
                self.url, self.status_code, None, None, None
            )

    def _perform_on_curl(self):
        """Drive the transfer and return the handle count from perform()."""
        while True:
            ret, num_handles = self.curl_multi.perform()
            # Keep calling while curl asks to be called again immediately.
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        return num_handles

    def _iter_chunks(self):
        """Yield raw body chunks as they arrive until the transfer ends.

        :raises urllib.error.HTTPError: when the status code is not 200.
        :raises pycurl.error: when curl reports a failed transfer.
        """
        while True:
            remaining = self._perform_on_curl()
            if self._any_data_received():
                self._check_status_code()
                yield self._get_received_data()
            if remaining == 0:
                break
            self.curl_multi.select(self.SELECT_TIMEOUT)

        self._check_status_code()
        self._check_curl_errors()

    def _check_curl_errors(self):
        """Raise ``pycurl.error`` for the first failure in info_read()."""
        # Index 2 of info_read() is used as the list of failures; the
        # first element of each entry is dropped before raising.
        for f in self.curl_multi.info_read()[2]:
            raise pycurl.error(*f[1:])

    def iter_lines(self):
        """Return a generator yielding the decoded lines of the body."""
        chunks = self._iter_chunks()
        return self._split_lines_from_chunks(chunks)

    def _split_lines_from_chunks(self, chunks):
        """Split an iterable of byte chunks into decoded text lines.

        A trailing partial line is carried over to the next chunk and
        emitted at the end if the body does not finish with a newline.
        """
        # Debug trace: marks the point where the charset is chosen.
        print('foo')
        print(self.headers)
        # NOTE: this prologue runs on the first next() of the generator,
        # i.e. before any chunk has been pulled from ``chunks``, so
        # ``self.headers`` holds only what was already stored by then.
        charset = None
        if 'content-type' in self.headers:
            content_type = self.headers['content-type'].lower()
            match = self._CHARSET_RE.search(content_type)
            if match:
                charset = match.group(1)
                print('Decoding using %s' % charset)
        if charset is None:
            charset = self.HTTP_STANDARD_ENCODING
            print('Assuming encoding is %s' % charset)
        pending = None
        for chunk in chunks:
            if pending is not None:
                chunk = pending + chunk
            lines = chunk.splitlines()
            # The chunk ends mid-line exactly when its last byte is also
            # the last byte of the last split line; hold that line back.
            if lines and lines[-1] and chunk and lines[-1][-1] == chunk[-1]:
                pending = lines.pop()
            else:
                pending = None
            for line in lines:
                yield line.decode(charset)
        if pending is not None:
            yield pending.decode(charset)

    def header_function(self, header_line):
        """Header callback: store one ``name: value`` response header.

        Lines without a colon (such as the status line) are ignored.
        Names are lower-cased; a repeated header overwrites the earlier
        value.
        """
        # Debug trace: one line of output per header callback.
        print('hello')
        header_line = header_line.decode(self.HTTP_STANDARD_ENCODING)
        if ':' not in header_line:
            return
        name, value = header_line.split(':', 1)
        name = name.strip()
        value = value.strip()
        name = name.lower()
        self.headers[name] = value


def request(method, url, data=None, params=None, headers=None,
            stream=False):
    """Issue an HTTP request through pycurl.

    Only streaming mode is implemented: when ``stream`` is truthy a
    ``CurlHTTPStream`` built from the arguments is returned; otherwise
    nothing is sent and ``None`` is returned.
    """
    if not stream:
        return None
    return CurlHTTPStream(method, url, data=data, params=params,
                          headers=headers)

当我尝试测试时，终端中的输出如下：

Python 3.5.1 (default, Dec 09 2015, 07:29:36) [GCC] on linux 
Type "help", "copyright", "credits" or "license" for more information. 
>>> from pycurl_requests.requests import request 
>>> r = request('GET', 'http://my-couchdb-instance:5984/user-30323561366530622d336135622d343637372d386464392d613038653536663865636566/_changes', params={'feed': 'continuous'}, stream=True) 
>>> for l in r.iter_lines(): 
...  print(l) 
... 
foo 
{} 
Assuming encoding is iso-8859-1 
hello 
hello 
hello 
hello 
hello 
hello 
hello 
{"seq":1,"id":"account","changes":[{"rev":"1-806053b347406e04d1872e13199fd3cf"}]} 
{"seq":4,"id":"identity-bd2c5007-9df3-4ece-9751-843bf5523edd","changes":[{"rev":"1-e3a98ec37776f2cb479b2dcae0266700"}]} 
{"seq":5,"id":"section_phone-0342667c-ecbd-401f-acfe-7bb2a1aa3159","changes":[{"rev":"1-457342bc895c7cb6924ceabd07e1ffcf"}]} 

CouchDB的变更订阅（changes feed）还会输出更多行，但它们与问题无关，所以我把输出截断了。

在输出中，foo表示代码进入了本应已经拿到响应头的那段逻辑，但下一行显示self.headers为空；随后的多个hello分别对应每一次对header_function()的调用。在头部回调被触发之前，把正文写入BytesIO的写回调怎么会先被调用呢？

回答

0

我找到了解决方案。问题在于_split_lines_from_chunks(self, chunks)中的字符集检测在收到任何响应数据之前就已经执行了，所以那时响应头还没有到达。

下面是可以正常工作的代码。现在charset是在第一行正文可用时才检测的，此时可以确定所有响应头都已经接收完毕。

import re 
import io 
import urllib 
import urllib.error 
import http 

import pycurl 


class CurlHTTPStream(object):
    """Streaming HTTP response read through pycurl's multi interface.

    A single ``pycurl.Curl`` handle is configured for the request and
    attached to a ``pycurl.CurlMulti`` handle.  Body bytes are collected
    by the write callback into an in-memory buffer and handed out line by
    line through :meth:`iter_lines`; response headers are collected by
    :meth:`header_function` into ``self.headers`` (lower-cased names).
    The body charset is resolved lazily, the first time a line is
    decoded, and cached afterwards.

    The ``data`` constructor argument is accepted for interface
    compatibility but is not used by this class.
    """

    # Timeout passed to CurlMulti.select() while waiting for activity.
    SELECT_TIMEOUT = 10
    # Encoding used for header lines and as the body fallback charset.
    HTTP_STANDARD_ENCODING = 'iso-8859-1'
    # Charset parameter of a (lower-cased) Content-Type value.  Optional
    # quotes are skipped and the value stops at whitespace, ';' or a quote,
    # so 'charset="utf-8"; x=y' yields 'utf-8' rather than '"utf-8";'.
    _CHARSET_RE = re.compile(r'charset=["\']?([^\s;"\']+)')

    def __init__(self, method, url, data=None, params=None, headers=None):
        """Configure the curl handles; no network I/O happens here.

        :param method: HTTP verb, set through ``CUSTOMREQUEST``.
        :param url: target URL without the extra query parameters.
        :param data: unused.
        :param params: optional mapping appended to ``url`` as a
            URL-encoded query string.
        :param headers: optional mapping of request headers.
        """
        self.url = url
        self.received_buffer = io.BytesIO()

        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.CUSTOMREQUEST, method)
        if headers:
            self.curl.setopt(
                pycurl.HTTPHEADER,
                [
                    '{}: {}'.format(key, value)
                    for key, value in headers.items()
                ],
            )
        self.curl.setopt(pycurl.URL, self._build_url(url, params))
        self.curl.setopt(pycurl.ENCODING, 'gzip')
        self.curl.setopt(pycurl.CONNECTTIMEOUT, 5)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.curl.setopt(pycurl.WRITEFUNCTION, self.received_buffer.write)

        self.curl_multi = pycurl.CurlMulti()
        self.curl_multi.add_handle(self.curl)

        self.status_code = 0
        self.headers = {}
        self._charset = None

    @staticmethod
    def _build_url(url, params):
        """Return ``url`` with ``params`` appended as an encoded query string.

        Keys and values are percent-encoded, and ``&`` is used as the
        separator when ``url`` already carries a query string.
        """
        if not params:
            return url
        # Imported locally: the module only imports urllib and urllib.error.
        from urllib.parse import urlencode

        if url.endswith(('?', '&')):
            separator = ''
        elif '?' in url:
            separator = '&'
        else:
            separator = '?'
        return '{}{}{}'.format(url, separator, urlencode(params))

    def _any_data_received(self):
        """Return True when the write callback has buffered body bytes."""
        return self.received_buffer.tell() != 0

    def _get_received_data(self):
        """Return the buffered body bytes and reset the buffer to empty."""
        result = self.received_buffer.getvalue()
        self.received_buffer.truncate(0)
        self.received_buffer.seek(0)
        return result

    def _check_status_code(self):
        """Raise ``urllib.error.HTTPError`` for a known non-200 status.

        The status is read from curl until it becomes non-zero and is
        cached in ``self.status_code`` afterwards.
        """
        if self.status_code == 0:
            self.status_code = self.curl.getinfo(pycurl.HTTP_CODE)
        if self.status_code != 0 and self.status_code != http.HTTPStatus.OK:
            raise urllib.error.HTTPError(
                self.url, self.status_code, None, None, None
            )

    def _perform_on_curl(self):
        """Drive the transfer and return the handle count from perform()."""
        while True:
            ret, num_handles = self.curl_multi.perform()
            # Keep calling while curl asks to be called again immediately.
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        return num_handles

    def _iter_chunks(self):
        """Yield raw body chunks as they arrive until the transfer ends.

        :raises urllib.error.HTTPError: when the status code is not 200.
        :raises pycurl.error: when curl reports a failed transfer.
        """
        while True:
            remaining = self._perform_on_curl()
            if self._any_data_received():
                self._check_status_code()
                yield self._get_received_data()
            if remaining == 0:
                break
            self.curl_multi.select(self.SELECT_TIMEOUT)

        self._check_status_code()
        self._check_curl_errors()

    def _check_curl_errors(self):
        """Raise ``pycurl.error`` for the first failure in info_read()."""
        # Index 2 of info_read() is used as the list of failures; the
        # first element of each entry is dropped before raising.
        for f in self.curl_multi.info_read()[2]:
            raise pycurl.error(*f[1:])

    def iter_lines(self):
        """Return a generator yielding the decoded lines of the body."""
        chunks = self._iter_chunks()
        return self._split_lines_from_chunks(chunks)

    def _split_lines_from_chunks(self, chunks):
        """Split an iterable of byte chunks into decoded text lines.

        A trailing partial line is carried over to the next chunk and
        emitted at the end if the body does not finish with a newline.
        ``self.charset`` is read only once a line is ready to decode.
        """
        # Debug trace: runs on the first next(), before any chunk is read.
        print('foo')
        print(self.headers)
        pending = None
        for chunk in chunks:
            if pending is not None:
                chunk = pending + chunk
            lines = chunk.splitlines()
            # The chunk ends mid-line exactly when its last byte is also
            # the last byte of the last split line; hold that line back.
            if lines and lines[-1] and chunk and lines[-1][-1] == chunk[-1]:
                pending = lines.pop()
            else:
                pending = None
            for line in lines:
                yield line.decode(self.charset)
        if pending is not None:
            yield pending.decode(self.charset)

    @property
    def charset(self):
        """Body charset, resolved on first access and cached.

        Taken from the ``charset`` parameter of the stored Content-Type
        header; falls back to ``HTTP_STANDARD_ENCODING`` when the header
        or the parameter is missing.
        """
        if self._charset is None:
            content_type = self.headers.get('content-type', '').lower()
            match = self._CHARSET_RE.search(content_type)
            if match:
                self._charset = match.group(1)
                print('Decoding using %s' % self._charset)
            else:
                self._charset = self.HTTP_STANDARD_ENCODING
                print('Assuming encoding is %s' % self._charset)
        return self._charset

    def header_function(self, header_line):
        """Header callback: store one ``name: value`` response header.

        Lines without a colon (such as the status line) are ignored.
        Names are lower-cased; a repeated header overwrites the earlier
        value.
        """
        # Debug trace: one line of output per header callback.
        print('hello')
        header_line = header_line.decode(self.HTTP_STANDARD_ENCODING)
        if ':' not in header_line:
            return
        name, value = header_line.split(':', 1)
        name = name.strip()
        value = value.strip()
        name = name.lower()
        self.headers[name] = value


def request(method, url, data=None, params=None, headers=None,
            stream=False):
    """Issue an HTTP request through pycurl.

    Only streaming mode is implemented: when ``stream`` is truthy a
    ``CurlHTTPStream`` built from the arguments is returned; otherwise
    nothing is sent and ``None`` is returned.
    """
    if not stream:
        return None
    return CurlHTTPStream(method, url, data=data, params=params,
                          headers=headers)