2013-04-09 82 views
3

我试图将一个 YCbCr 文件从 8 bpp 转换为 10 bpp。(主题:Python 处理二进制文件的性能)

到目前为止,我的最佳方法仍然比最基本的天真C实现慢 。

C 中的朴素(逐字节)实现运行时间约为 8 秒;改为按块读写后,时间降到 1 秒以下。

我很想知道用标准 Python 处理二进制文件能得到什么样的性能。示例文件是 CIF 分辨率的,与 1080p 的内容相比算"小"的。欢迎补充 numpy 的建议,不过我主要感兴趣的是标准 Python。

测试文件可以从

http://trace.eas.asu.edu/yuv/foreman/foreman_cif.7z 

正确的 10 位输出文件的 sha1sum 是

c511dabc793383f7fd0ed69b4bb9b9f89ef73b84 

Python 代码:

#!/usr/bin/env python 

import array 

f_in = 'foreman_cif.yuv' 
f_out = 'py_10bpp.yuv' 

def bytesfromfile(f):
    """Yield successive chunks of binary file object *f* as array('B').

    Reads 8 KiB at a time and stops at EOF (an empty read). Each yielded
    array holds unsigned byte values (0..255).
    """
    while True:
        raw = array.array('B')
        # array.fromstring() was removed in Python 3.9; frombytes() is the
        # direct replacement (available since Python 3.2).
        raw.frombytes(f.read(8192))
        if not raw:
            break
        yield raw

# Read the 8-bit input in chunks, widen every byte to a 10-bit value
# (b << 2) and emit it as two little-endian bytes, matching the C version.
with open(f_in, 'rb') as fd_in, \
     open(f_out, 'wb') as fd_out:

    for chunk in bytesfromfile(fd_in):
        data = []
        for i in chunk:
            i <<= 2                        # 8 bpp -> 10 bpp: scale by 4
            data.append(i & 0xff)          # low byte
            data.append((i >> 8) & 0xff)   # high byte (always 0..3)

        # array.tostring() was removed in Python 3.9; tobytes() is the
        # direct replacement (available since Python 3.2).
        fd_out.write(array.array('B', data).tobytes())

对应的朴素 C 实现:

#include <stdio.h> 
#include <stdlib.h> 

/*
 * Naive 8 bpp -> 10 bpp converter: read one byte at a time, scale it by
 * 4 (<< 2) and write the result as two little-endian bytes.
 */
int main(int argc, char** argv)
{
    int c;
    /* was int d[2]: fwrite()-ing the first byte of an int only produces
     * the intended byte on little-endian hosts; unsigned char is portable */
    unsigned char d[2];

    FILE* fd_in;
    FILE* fd_out;

    fd_in = fopen("foreman_cif.yuv", "rb");
    if (fd_in == NULL) {               /* fopen can fail: report and bail */
        perror("foreman_cif.yuv");
        return EXIT_FAILURE;
    }

    fd_out = fopen("c_10bpp.yuv", "wb");
    if (fd_out == NULL) {
        perror("c_10bpp.yuv");
        fclose(fd_in);
        return EXIT_FAILURE;
    }

    while ((c = fgetc(fd_in)) != EOF) {
        c <<= 2;                                  /* 8 bpp -> 10 bpp */
        d[0] = (unsigned char)(c & 0xff);         /* low byte */
        d[1] = (unsigned char)((c >> 8) & 0xff);  /* high byte (0..3) */

        fwrite(d, 1, 2, fd_out);  /* one call instead of two tiny writes */
    }

    fclose(fd_in);
    fclose(fd_out);

    return EXIT_SUCCESS;
}
+1

FWIW,运行此代码需要我在CPython中处理33秒,在PyPy中处理(未修改)7秒,在C中处理约3秒。 – DSM 2013-04-09 20:07:37

+0

同样的球场:用'-O2'编译时,Cython〜20s和C〜2.5 。首先做了没有电源线到我的笔记本电脑,导致我的核心运行@ 800MHz的测量。 “数量级”来自于此。连接电源线和运行@ 2.2GHz的核心我得到上面的数字 – 2013-04-09 20:33:08

回答

4

问题中的代码在我的机器上需要 25 秒;numpy 版本只需 0.37 秒:

import numpy as np 

# Map the 8-bit input read-only and create an output file twice its size.
# Even offsets receive the low byte of (sample << 2); odd offsets receive
# the high byte, which for an 8-bit sample is simply sample >> 6.
a_in = np.memmap('foreman_cif.yuv', mode='readonly')
a_out = np.memmap('py_10bpp.yuv', mode='write', shape=2*len(a_in))
a_out[0::2] = np.left_shift(a_in, 2)
a_out[1::2] = np.right_shift(a_in, 6)

cython - 0.20秒:

from functools import partial 

import pyximport; pyximport.install() # pip install cython 
from bpp8to10 import convert # bpp8to10.pyx 

f_in = 'foreman_cif.yuv' 
f_out = 'py_10bpp.yuv' 

def main():
    """Convert f_in (8 bpp) to f_out (10 bpp) using the Cython helper.

    Streams the input in 8 KiB chunks; iter(callable, sentinel) stops when
    read() returns the empty bytes object at EOF.
    """
    with open(f_in, 'rb') as fd_in, open(f_out, 'wb') as fd_out:
        for chunk in iter(partial(fd_in.read, 8192), b''):
            fd_out.write(convert(chunk))

if __name__ == '__main__':  # don't run the conversion on mere import
    main()

其中bpp8to10.pyx

from cpython.bytes cimport PyBytes_FromStringAndSize 

def convert(bytes chunk not None):
    """Widen each 8-bit sample in *chunk* to a 10-bit little-endian pair.

    Returns a new bytes object exactly twice the length of *chunk*.
    """
    cdef:
        # Allocate an uninitialised bytes object of the final size up front,
        # then fill its internal buffer directly.
        bytes data = PyBytes_FromStringAndSize(NULL, len(chunk)*2)
        char* buf = data # no copy
        Py_ssize_t j = 0
        unsigned char c
    for c in chunk:
        # Low byte of (c << 2): C char assignment truncates to 8 bits.
        buf[j] = (c << 2)
        # High byte: top two bits of the sample (value 0..3).
        buf[j + 1] = (c >> 6)
        j += 2
    return data

纯 CPython 版本的主要加速来自把模块级代码移进一个函数(main()),再加上 multiprocessing —— 6.7 秒(2 个 CPU):

from functools import partial 
from multiprocessing import Pool 

f_in = 'foreman_cif.yuv' 
f_out = 'py_10bpp.yuv' 

def convert(chunk):
    """Widen each 8-bit sample in *chunk* to a 10-bit little-endian pair.

    Returns a bytearray twice the length of *chunk*.
    """
    data = bytearray()  # [] -> bytearray(): 17 -> 15 seconds
    data_append = data.append  # bind method once: 15 -> 12 seconds
    for b in bytearray(chunk):  # on Python 3: `for b in chunk:`
        data_append((b << 2) & 0xff)  # low byte
        # BUG FIX: was (b >> 8) & 0xff, which is always 0 for b < 256 and
        # produced an all-zero high byte. The high byte of (b << 2) is
        # b >> 6, as in the numpy and pypy versions.
        data_append((b >> 6) & 0xff)
    return data

def main():  # moving module-level code into a function: 25 -> 17 seconds
    """Convert f_in to f_out, farming chunks out to a 2-process pool."""
    pool = Pool(processes=2)  # 12 -> 6.7 seconds
    with open(f_in, 'rb') as fd_in, open(f_out, 'wb') as fd_out:
        # imap preserves chunk order, so the output bytes stay in sequence.
        for data in pool.imap(convert, iter(partial(fd_in.read, 8192), b'')):
            fd_out.write(data)

if __name__ == '__main__':
    # The guard is required with multiprocessing: on spawn-based platforms
    # (e.g. Windows) worker processes re-import this module, and an
    # unguarded main() call would recursively spawn more pools.
    main()

pypy - 1.6秒:

f_in = 'foreman_cif.yuv' 
f_out = 'py_10bpp.yuv' 

def convert(chunk):
    """Return *chunk* widened from 8 bpp to 10 bpp, two bytes per sample."""
    # Preallocate the output (the original grew it with append()); fill
    # low/high byte pairs in place.
    out = bytearray(2 * len(chunk))
    pos = 0
    for sample in bytearray(chunk):
        widened = sample << 2
        out[pos] = widened & 0xff    # low byte
        out[pos + 1] = widened >> 8  # high byte, i.e. sample >> 6 (0..3)
        pos += 2
    return out

# Drive the conversion: pull 8 KiB chunks until read() returns the empty
# bytes sentinel at EOF, writing each converted chunk straight out.
with open(f_in, 'rb') as fd_in, open(f_out, 'wb') as fd_out:
    for chunk in iter(lambda: fd_in.read(8192), b''):
        fd_out.write(convert(chunk))
+0

这看起来很有希望。我印象深刻!谢谢。 – 2013-04-10 07:48:56