0
受到this question的接受答案的启发我试图用类似requests
的界面来包装PyCurl。 Everythig会很好,但在按照PyCURL docs描述如何从头文件读取正文编码后,我遇到了以下问题。每个响应标题都会调用标题回调,但是只有在迭代器开始产生响应行后,才会使编码/字符集检测变得毫无意义。PyCURL正在处理标题之前
下面的代码:
import re
import io
import urllib
import urllib.error
import http
import pycurl
class CurlHTTPStream(object):
SELECT_TIMEOUT = 10
HTTP_STANDARD_ENCODING = 'iso-8859-1'
def __init__(self, method, url, data=None, params=None, headers=None):
self.url = url
self.received_buffer = io.BytesIO()
self.curl = pycurl.Curl()
self.curl.setopt(pycurl.CUSTOMREQUEST, method)
if headers:
self.curl.setopt(
pycurl.HTTPHEADER,
[
'{}: {}'.format(key, value)
for key, value in headers.items()
]
)
if params:
query_string = '&'.join((
'{}={}'.format(key, value)
for key, value in params.items()
))
url = '{}?{}'.format(url, query_string)
self.curl.setopt(pycurl.URL, url)
self.curl.setopt(pycurl.ENCODING, 'gzip')
self.curl.setopt(pycurl.CONNECTTIMEOUT, 5)
self.curl.setopt(pycurl.HEADERFUNCTION, self.header_function)
self.curl.setopt(pycurl.WRITEFUNCTION, self.received_buffer.write)
self.curl_multi = pycurl.CurlMulti()
self.curl_multi.add_handle(self.curl)
self.status_code = 0
self.headers = {}
def _any_data_received(self):
return self.received_buffer.tell() != 0
def _get_received_data(self):
result = self.received_buffer.getvalue()
self.received_buffer.truncate(0)
self.received_buffer.seek(0)
return result
def _check_status_code(self):
if self.status_code == 0:
self.status_code = self.curl.getinfo(pycurl.HTTP_CODE)
if self.status_code != 0 and self.status_code != http.HTTPStatus.OK:
raise urllib.error.HTTPError(
self.url, self.status_code, None, None, None
)
def _perform_on_curl(self):
while True:
ret, num_handles = self.curl_multi.perform()
if ret != pycurl.E_CALL_MULTI_PERFORM:
break
return num_handles
def _iter_chunks(self):
while True:
remaining = self._perform_on_curl()
if self._any_data_received():
self._check_status_code()
yield self._get_received_data()
if remaining == 0:
break
self.curl_multi.select(self.SELECT_TIMEOUT)
self._check_status_code()
self._check_curl_errors()
def _check_curl_errors(self):
for f in self.curl_multi.info_read()[2]:
raise pycurl.error(*f[1:])
def iter_lines(self):
chunks = self._iter_chunks()
return self._split_lines_from_chunks(chunks)
def _split_lines_from_chunks(self, chunks):
print('foo')
print(self.headers)
charset = None
if 'content-type' in self.headers:
content_type = self.headers['content-type'].lower()
match = re.search('charset=(\S+)', content_type)
if match:
charset = match.group(1)
print('Decoding using %s' % charset)
if charset is None:
charset = self.HTTP_STANDARD_ENCODING
print('Assuming encoding is %s' % charset)
pending = None
for chunk in chunks:
if pending is not None:
chunk = pending + chunk
lines = chunk.splitlines()
if lines and lines[-1] and chunk and lines[-1][-1] == chunk[-1]:
pending = lines.pop()
else:
pending = None
for line in lines:
yield line.decode(charset)
if pending is not None:
yield pending.decode(charset)
def header_function(self, header_line):
print('hello')
header_line = header_line.decode(self.HTTP_STANDARD_ENCODING)
if ':' not in header_line:
return
name, value = header_line.split(':', 1)
name = name.strip()
value = value.strip()
name = name.lower()
self.headers[name] = value
def request(method, url, data=None, params=None, headers=None,
stream=False):
if stream:
return CurlHTTPStream(method, url, data=data, params=params,
headers=headers)
这就是在终端发生了什么,当我尝试测试:
Python 3.5.1 (default, Dec 09 2015, 07:29:36) [GCC] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from pycurl_requests.requests import request
>>> r = request('GET', 'http://my-couchdb-instance:5984/user-30323561366530622d336135622d343637372d386464392d613038653536663865636566/_changes', params={'feed': 'continuous'}, stream=True)
>>> for l in r.iter_lines():
... print(l)
...
foo
{}
Assuming encoding is iso-8859-1
hello
hello
hello
hello
hello
hello
hello
{"seq":1,"id":"account","changes":[{"rev":"1-806053b347406e04d1872e13199fd3cf"}]}
{"seq":4,"id":"identity-bd2c5007-9df3-4ece-9751-843bf5523edd","changes":[{"rev":"1-e3a98ec37776f2cb479b2dcae0266700"}]}
{"seq":5,"id":"section_phone-0342667c-ecbd-401f-acfe-7bb2a1aa3159","changes":[{"rev":"1-457342bc895c7cb6924ceabd07e1ffcf"}]}
有更多的线路从CouchDB的变化来养活,但我截断输出因为它们不相关。
在输出中基本上foo
表示它进入了期望标题的块,但下一行显示self.headers
为空。并且多个hello
代表每次致电header_function()
。在头部回调被触发之前,如何调用将主体写入BytesIO
的写回调?