2014-04-30 41 views
6

BigQuery是否具有MD5()功能?我知道它有cityhash,但我特别需要MD5。谢谢!是否有可能在BigQuery中使用MD5进行散列?

+0

@dcsohl 由于需要支持原有系统,如果能有内置的 MD5 函数就太好了。我们以前在 MySQL 和 PostgreSQL 中使用过这个函数,因此需要一种简单的方法在 BigQuery 中支持/模拟它。 – vlsergey

回答

2

没有,但 BigQuery 确实提供了一些 SHA1 散列支持。SHA1() 函数返回的是字节,不过您可以用 TO_BASE64() 将其转换为 base64,得到一个比较好看的字符串;也可以用 STRING() 转换,但得到的字符串会很难看:

SELECT TO_BASE64(SHA1(corpus)) from [publicdata:samples.shakespeare] limit 100; 
+0

有没有支持sha1(),to_base64()等支持函数的状态更新?他们是否期望得到支持?我没有在文档中找到它们。其他什么可用?谢谢! –

+0

据我所知,TO_BASE64、SHA1 函数目前没有任何公开记录的进展。不过,在[问题175](https://code.google.com/p/google-bigquery/issues/detail?id=175#c4)中提到,TO_BASE64 和 FROM_BASE64 函数有望在不久的将来写入文档,但没有给出明确的时间表。我没有找到关于 SHA1 的问题,也没有找到其他提及它的地方。此评论中提到的所有函数在我这里都可以正常使用,不过正如 Jordan 所说,它们可能随时变更。文档中没有提到 MD5,它也不是 BigQuery 中已知的函数。 – Nicholas

0

在这里重提一个旧帖。现在可以使用用户定义函数(UDF)在 BigQuery 中实现 MD5:https://cloud.google.com/bigquery/user-defined-functions

下面是一些示例代码:

/**
 * Runs one MD5 block transformation, updating the state in place.
 *
 * @param {number[]} x - Four state words; x[0]..x[3] are read at the start
 *     and overwritten at the end. Nothing is returned.
 * @param {number[]} k - Message words for this block; indices 0..15 are read.
 */
function md5cycle(x, k) { 
// Work on local copies so the incoming state can be added back at the end.
var a = x[0], b = x[1], c = x[2], d = x[3]; 

// Round 1: 16 steps through ff, message words taken in order k[0]..k[15],
// shift amounts cycling 7, 12, 17, 22.
a = ff(a, b, c, d, k[0], 7, -680876936); 
d = ff(d, a, b, c, k[1], 12, -389564586); 
c = ff(c, d, a, b, k[2], 17, 606105819); 
b = ff(b, c, d, a, k[3], 22, -1044525330); 
a = ff(a, b, c, d, k[4], 7, -176418897); 
d = ff(d, a, b, c, k[5], 12, 1200080426); 
c = ff(c, d, a, b, k[6], 17, -1473231341); 
b = ff(b, c, d, a, k[7], 22, -45705983); 
a = ff(a, b, c, d, k[8], 7, 1770035416); 
d = ff(d, a, b, c, k[9], 12, -1958414417); 
c = ff(c, d, a, b, k[10], 17, -42063); 
b = ff(b, c, d, a, k[11], 22, -1990404162); 
a = ff(a, b, c, d, k[12], 7, 1804603682); 
d = ff(d, a, b, c, k[13], 12, -40341101); 
c = ff(c, d, a, b, k[14], 17, -1502002290); 
b = ff(b, c, d, a, k[15], 22, 1236535329); 

// Round 2: 16 steps through gg, message index advancing by 5 modulo 16
// starting at 1, shift amounts cycling 5, 9, 14, 20.
a = gg(a, b, c, d, k[1], 5, -165796510); 
d = gg(d, a, b, c, k[6], 9, -1069501632); 
c = gg(c, d, a, b, k[11], 14, 643717713); 
b = gg(b, c, d, a, k[0], 20, -373897302); 
a = gg(a, b, c, d, k[5], 5, -701558691); 
d = gg(d, a, b, c, k[10], 9, 38016083); 
c = gg(c, d, a, b, k[15], 14, -660478335); 
b = gg(b, c, d, a, k[4], 20, -405537848); 
a = gg(a, b, c, d, k[9], 5, 568446438); 
d = gg(d, a, b, c, k[14], 9, -1019803690); 
c = gg(c, d, a, b, k[3], 14, -187363961); 
b = gg(b, c, d, a, k[8], 20, 1163531501); 
a = gg(a, b, c, d, k[13], 5, -1444681467); 
d = gg(d, a, b, c, k[2], 9, -51403784); 
c = gg(c, d, a, b, k[7], 14, 1735328473); 
b = gg(b, c, d, a, k[12], 20, -1926607734); 

// Round 3: 16 steps through hh, message index advancing by 3 modulo 16
// starting at 5, shift amounts cycling 4, 11, 16, 23.
a = hh(a, b, c, d, k[5], 4, -378558); 
d = hh(d, a, b, c, k[8], 11, -2022574463); 
c = hh(c, d, a, b, k[11], 16, 1839030562); 
b = hh(b, c, d, a, k[14], 23, -35309556); 
a = hh(a, b, c, d, k[1], 4, -1530992060); 
d = hh(d, a, b, c, k[4], 11, 1272893353); 
c = hh(c, d, a, b, k[7], 16, -155497632); 
b = hh(b, c, d, a, k[10], 23, -1094730640); 
a = hh(a, b, c, d, k[13], 4, 681279174); 
d = hh(d, a, b, c, k[0], 11, -358537222); 
c = hh(c, d, a, b, k[3], 16, -722521979); 
b = hh(b, c, d, a, k[6], 23, 76029189); 
a = hh(a, b, c, d, k[9], 4, -640364487); 
d = hh(d, a, b, c, k[12], 11, -421815835); 
c = hh(c, d, a, b, k[15], 16, 530742520); 
b = hh(b, c, d, a, k[2], 23, -995338651); 

// Round 4: 16 steps through ii, message index advancing by 7 modulo 16
// starting at 0, shift amounts cycling 6, 10, 15, 21.
a = ii(a, b, c, d, k[0], 6, -198630844); 
d = ii(d, a, b, c, k[7], 10, 1126891415); 
c = ii(c, d, a, b, k[14], 15, -1416354905); 
b = ii(b, c, d, a, k[5], 21, -57434055); 
a = ii(a, b, c, d, k[12], 6, 1700485571); 
d = ii(d, a, b, c, k[3], 10, -1894986606); 
c = ii(c, d, a, b, k[10], 15, -1051523); 
b = ii(b, c, d, a, k[1], 21, -2054922799); 
a = ii(a, b, c, d, k[8], 6, 1873313359); 
d = ii(d, a, b, c, k[15], 10, -30611744); 
c = ii(c, d, a, b, k[6], 15, -1560198380); 
b = ii(b, c, d, a, k[13], 21, 1309151649); 
a = ii(a, b, c, d, k[4], 6, -145523070); 
d = ii(d, a, b, c, k[11], 10, -1120210379); 
c = ii(c, d, a, b, k[2], 15, 718787259); 
b = ii(b, c, d, a, k[9], 21, -343485551); 

// Fold the block's result back into the caller's state array.
x[0] = add32(a, x[0]); 
x[1] = add32(b, x[1]); 
x[2] = add32(c, x[2]); 
x[3] = add32(d, x[3]); 

} 

/**
 * Shared step used by ff, gg, hh and ii: sums the four inputs with add32,
 * rotates the 32-bit sum left by `s` bits, then adds `b`.
 *
 * @param {number} q - Result of the round's bitwise mixing function.
 * @param {number} a - State word being replaced.
 * @param {number} b - State word added after the rotation.
 * @param {number} x - Message word for this step.
 * @param {number} s - Left-rotation amount in bits.
 * @param {number} t - Additive constant for this step.
 * @returns {number} The new value for the state word.
 */
function cmn(q, a, b, x, s, t) {
  var sum = add32(add32(a, q), add32(x, t));
  // Left rotation: the bits shifted out on the left re-enter on the right.
  var rotated = (sum << s) | (sum >>> (32 - s));
  return add32(rotated, b);
}

/**
 * Round-1 step: mixes b, c, d with (b AND c) OR (NOT b AND d), i.e. each bit
 * of b selects the matching bit of c (when 1) or d (when 0), then hands the
 * result to cmn.
 *
 * @returns {number} Whatever cmn returns for the mixed value.
 */
function ff(a, b, c, d, x, s, t) {
  var mixed = (b & c) | ((~b) & d);
  return cmn(mixed, a, b, x, s, t);
}

/**
 * Round-2 step: mixes b, c, d with (b AND d) OR (c AND NOT d), i.e. each bit
 * of d selects the matching bit of b (when 1) or c (when 0), then hands the
 * result to cmn.
 *
 * @returns {number} Whatever cmn returns for the mixed value.
 */
function gg(a, b, c, d, x, s, t) {
  var mixed = (b & d) | (c & (~d));
  return cmn(mixed, a, b, x, s, t);
}

/**
 * Round-3 step: mixes b, c, d with a three-way XOR and hands the result
 * to cmn.
 *
 * @returns {number} Whatever cmn returns for the mixed value.
 */
function hh(a, b, c, d, x, s, t) {
  var mixed = b ^ c ^ d;
  return cmn(mixed, a, b, x, s, t);
}

/**
 * Round-4 step: mixes b, c, d with c XOR (b OR NOT d) and hands the result
 * to cmn.
 *
 * @returns {number} Whatever cmn returns for the mixed value.
 */
function ii(a, b, c, d, x, s, t) {
  var mixed = c ^ (b | (~d));
  return cmn(mixed, a, b, x, s, t);
}

/**
 * Computes the four MD5 state words for a string.
 *
 * Every complete 64-character block is converted with md5blk and fed to
 * md5cycle; the remaining 0..63 characters are packed into a final padded
 * block (two blocks when fewer than 8 bytes are left for the length field).
 *
 * Each character contributes `charCodeAt(i)` as if it were one byte, so code
 * units above 0xFF spill into neighbouring bytes and the digest will not
 * match a byte-oriented MD5 of the same text.
 *
 * @param {string} s - Input text.
 * @returns {number[]} The four state words after the final block.
 */
function md51(s) {
  var n = s.length;
  var state = [1732584193, -271733879, -1732584194, 271733878];
  var i;

  // Digest all complete 64-character blocks.
  for (i = 64; i <= s.length; i += 64) {
    md5cycle(state, md5blk(s.substring(i - 64, i)));
  }

  // Pack what is left (fewer than 64 characters) little-endian, four
  // characters per word.
  var rest = s.substring(i - 64);
  var tail = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
  for (i = 0; i < rest.length; i++) {
    tail[i >> 2] |= rest.charCodeAt(i) << ((i % 4) << 3);
  }

  // Append the mandatory 0x80 marker byte right after the last character.
  tail[i >> 2] |= 0x80 << ((i % 4) << 3);

  // More than 55 characters leave no room for the length in words 14..15:
  // flush this block and continue with an all-zero one.
  if (i > 55) {
    md5cycle(state, tail);
    for (i = 0; i < 16; i++) {
      tail[i] = 0;
    }
  }

  // Message length in bits; only the low word (index 14) is written.
  tail[14] = n * 8;
  md5cycle(state, tail);
  return state;
}

/* there needs to be support for Unicode here, 
* unless we pretend that we can redefine the MD-5 
* algorithm for multi-byte characters (perhaps 
* by adding every four 16-bit characters and 
* shortening the sum to 32 bits). Otherwise 
* I suggest performing MD-5 as if every character 
* was two bytes--e.g., 0040 0025 = @%--but then 
* how will an ordinary MD-5 sum be matched? 
* There is no way to standardize text to something 
* like UTF-8 before transformation; speed cost is 
* utterly prohibitive. The JavaScript standard 
* itself needs to look at this: it should start 
* providing access to strings as preformed UTF-8 
* 8-bit unsigned value arrays. 
*/ 
/**
 * Converts a 64-character block into sixteen words, combining four
 * consecutive character codes per word with the first character in the
 * lowest byte.
 *
 * @param {string} s - Block of text; positions 0..63 are read.
 * @returns {number[]} Sixteen numbers, one per group of four characters.
 */
function md5blk(s) {
  var words = [];
  for (var w = 0; w < 16; w++) {
    var base = w * 4;
    var byte0 = s.charCodeAt(base);
    var byte1 = s.charCodeAt(base + 1) << 8;
    var byte2 = s.charCodeAt(base + 2) << 16;
    var byte3 = s.charCodeAt(base + 3) << 24;
    // Plain addition (not OR) is kept on purpose so out-of-range inputs
    // produce exactly the same numbers as before.
    words.push(byte0 + byte1 + byte2 + byte3);
  }
  return words;
}

// Lookup table from a 4-bit value (0..15) to its lowercase hexadecimal digit.
// It must hold all sixteen digits because rhex indexes it with every nibble.
var hex_chr = '0123456789abcdef'.split('');

/**
 * Formats a 32-bit word as eight characters taken from hex_chr, emitting the
 * least-significant byte first and, within each byte, the high nibble before
 * the low nibble.
 *
 * @param {number} n - Word to format.
 * @returns {string} Concatenation of the looked-up characters.
 */
function rhex(n) {
  var digits = '';
  for (var byteIndex = 0; byteIndex < 4; byteIndex++) {
    var shift = byteIndex * 8;
    var highNibble = (n >> (shift + 4)) & 0x0F;
    var lowNibble = (n >> shift) & 0x0F;
    digits += hex_chr[highNibble] + hex_chr[lowNibble];
  }
  return digits;
}

/**
 * Renders an array of words as one string by formatting each element with
 * rhex and concatenating the results in order.
 *
 * The input array is left untouched; the formatted pieces are collected in a
 * local array so callers can keep using their numeric state afterwards.
 *
 * @param {number[]} x - Words to format.
 * @returns {string} The joined rhex output of every element.
 */
function hex(x) {
  var parts = [];
  for (var i = 0; i < x.length; i++) {
    parts.push(rhex(x[i]));
  }
  return parts.join('');
}

/**
 * Convenience entry point: computes the state words for `s` with md51 and
 * formats them as a string with hex.
 *
 * @param {string} s - Input text.
 * @returns {string} The formatted digest.
 */
function md5(s) {
  var stateWords = md51(s);
  return hex(stateWords);
}

/**
 * Adds two numbers and wraps the sum to a signed 32-bit integer.
 *
 * @param {number} a - First addend.
 * @param {number} b - Second addend.
 * @returns {number} The sum truncated to 32 bits (signed).
 */
function add32(a, b) {
  var sum = a + b;
  // OR with zero applies the same 32-bit truncation as AND with 0xFFFFFFFF.
  return sum | 0;
}

// Name of the single input column the function reads from each record.
var input_columns = ['value'];

// Shape of each emitted row: one string column named 'md5'.
var output_schema = [
    {name: 'md5', type: 'string'}
];

// Register the table-valued function with BigQuery.
bigquery.create_tvf(
    'md5', // The function name exposed to Dremel.
    input_columns,
    output_schema,
    // Invoked once per input record; emits that record's digest.
    function(record, emit) {
        var digest = md5(record.value);
        emit({md5: digest});
    }
);
+0

我们在很大程度上依赖 JS UDF 来实现这类函数,例如 SHA256。问题是,Google 团队会主动把这些查询标记为高计费级别。我希望 JS UDF 的这个问题迟早能以某种方式得到解决,因为就目前的情况而言,它们在大规模场景下没有多大用处。 –

2

由于在谷歌搜索“BigQuery MD5”时会搜到这个问题,值得指出的是:BigQuery 在标准 SQL 中原生支持以下散列函数:

  • MD5
  • SHA1
  • SHA256
  • SHA512
+0

谢谢!这些似乎返回base64而不是十六进制。你知道我可以如何转换它吗? – Bugs

+1

有一个[你可以关注的功能请求](https://issuetracker.google.com/issues/62599093),它也有一个解决方法。这有帮助吗? –

+1

(作为更新,现在有一个'TO_HEX'函数,可以将字节转换为十六进制)。 –

相关问题