2017-07-07 114 views
3

我刚接触 BigQuery,在一个约 1 亿(100M)个点的数据集上查询超时。我想找出数值开始持续接近 0 的点(停止点),以及数值开始持续高于 0 的点(开始点)。问题标题:BigQuery 分析函数——如何提高查询性能

我已经把参与 JOIN 的子查询(它用来确定起始文件时间 start_file_time)单独保存成了一个数据集,但这没有帮助。(seconds 会跨多个“文件”持续递增。)

导致问题的部分,是最开始对前序点和后续点所做的聚合。

WITH test AS 
(SELECT 'A' as ACM, CAST('2017-01-01' AS DATE) as file_date, CAST('10:10:10' AS TIME) as file_time , 0.0 as value, 0.1 as seconds 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.2 #start 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 2000, 0.3 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 1000, 0.4 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.5 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:10', -1000, 0.6 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:10', -2000, 0.7 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.8 #stop 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.9 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.0 #start 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.1 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.2 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 2000, 1.3 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.4 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.5 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', -1000, 1.6 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', -2000, 1.7 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.8 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.9 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 2000, 2.0 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 1000, 2.1 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 0, 2.2 #stop 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 20, 2.3 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 0, 2.4 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.1 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.2 #start 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 2000, 0.3 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 1000, 0.4 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.5 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:10', -1000, 0.6 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:10', -2000, 0.7 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.8 #stop 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.9 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.0 #start 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.1 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.2 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 2000, 1.3 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.4 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.5 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', -1000, 1.6 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', -2000, 1.7 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.8 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.9 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 2000, 2.0 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 1000, 2.1 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 0, 2.2 #stop 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 20, 2.3 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 0, 2.4) 
SELECT 
    acm, 
    file_date, 
    start_file_time, 
    file_times, 
    agg_sec as start_stop 
FROM (
    SELECT 
    acm, 
    file_date, 
    start_file_time, 
    file_times, 
    ARRAY_AGG(kind) OVER w AS agg_kind, 
    ARRAY_AGG(seconds) OVER w AS agg_sec 
    FROM (
    SELECT 
     acm, 
     file_date, 
     start_file_time,  
     ARRAY(SELECT DISTINCT x FROM UNNEST(file_times) as x) AS file_times, 
     seconds, 
     CASE 
     WHEN (ABS(prev_val) < 50 and ABS(next_val) >= 50 and next_avg >= 50 and prev_avg < 50) THEN 'start' 
     WHEN (ABS(next_val) < 50 and ABS(prev_val) >= 50 and prev_avg >= 50 and next_avg < 50) THEN 'stop' 
     END as kind, 
     prev_val, next_val, prev_avg, next_avg 
    FROM (
     SELECT 
     s.acm as acm, 
     s.file_date as file_date, 
     s.start_file_time as start_file_time, 
     seconds, 
     value, 
     ARRAY_AGG(s.file_time) OVER (PARTITION BY s.acm, s.file_date, s.start_file_time) as file_times, 
     AVG(ABS(value)) OVER prev as prev_avg, 
     NTH_VALUE(value, 2) OVER prev as prev_val, 
     AVG(ABS(value)) OVER next as next_avg, 
     NTH_VALUE(value, 2) OVER next as next_val 
     FROM test v 
     JOIN (
     SELECT 
      acm, 
      file_date, 
      file_time, 
      TIME_SUB(file_time, INTERVAL CAST(FLOOR(MIN(seconds)) AS INT64) SECOND) as start_file_time 
     FROM test 
     GROUP BY acm, file_date, file_time 
    ) s ON s.acm = v.acm AND s.file_date = v.file_date AND s.file_time = v.file_time 
     WINDOW prev AS (PARTITION BY s.acm, s.file_date, s.start_file_time ORDER BY seconds ROWS 2 PRECEDING), next AS (PARTITION BY s.acm, s.file_date, s.start_file_time ORDER BY seconds ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING) 
    ) 
    WHERE value = 0) 
    WHERE kind IN ('start', 'stop') 
    WINDOW w AS (PARTITION BY acm, file_date, start_file_time ORDER BY seconds ROWS 1 PRECEDING)) 
WHERE ARRAY_LENGTH(agg_kind) = 2 AND agg_kind[ORDINAL(1)] = 'start' AND agg_kind[ORDINAL(2)] = 'stop' 
; 
+1

请澄清 - 您的文章中的示例代码会生成**正确的结果**,但应用于实际数据时会超时? –

+0

重要提示:您可以点击答案左侧投票按钮下方的对勾,将该答案标记为已接受。关于为什么这很重要,请参阅 http://meta.stackexchange.com/questions/5234/how-does-accepting-an-answer-work#5235 。为答案投票同样重要,请给有用的答案投赞成票。……当有人回答了您的问题时,您可以在 http://stackoverflow.com/help/someone-answers 查看该怎么做。遵循这些简单的规则,既能提高您自己的声誉分,也能让我们更有动力回答您的问题 :o) 请考虑一下! –

回答

1

请试试下面的版本,看看是否有所改善。
我尽量保留了你原来的代码。

#standardSQL 
WITH test AS 
(SELECT 'A' AS ACM, CAST('2017-01-01' AS DATE) AS file_date, CAST('10:10:10' AS TIME) AS file_time , 0.0 AS value, 0.1 AS seconds 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.2 #start 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 2000, 0.3 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 1000, 0.4 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.5 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:10', -1000, 0.6 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:10', -2000, 0.7 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.8 #stop 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:10', 0, 0.9 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.0 #start 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.1 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.2 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 2000, 1.3 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.4 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.5 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', -1000, 1.6 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', -2000, 1.7 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 0, 1.8 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:11', 1000, 1.9 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 2000, 2.0 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 1000, 2.1 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 0, 2.2 #stop 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 20, 2.3 
    UNION ALL SELECT 'A', '2017-01-01', '10:10:12', 0, 2.4 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.1 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.2 #start 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 2000, 0.3 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 1000, 0.4 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.5 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:10', -1000, 0.6 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:10', -2000, 0.7 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.8 #stop 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:10', 0, 0.9 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.0 #start 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.1 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.2 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 2000, 1.3 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.4 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.5 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', -1000, 1.6 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', -2000, 1.7 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 0, 1.8 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:11', 1000, 1.9 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 2000, 2.0 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 1000, 2.1 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 0, 2.2 #stop 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 20, 2.3 
    UNION ALL SELECT 'B', '2017-01-01', '10:10:12', 0, 2.4 
), temp1 AS (
    SELECT acm, file_date, value, seconds, 
    TIME_SUB(file_time, INTERVAL CAST(FLOOR(seconds) AS INT64) SECOND) AS start_file_time 
    FROM test 
), temp2 AS (
    SELECT 
    acm, file_date, start_file_time, seconds, 
    AVG(ABS(value)) OVER prev AS prev_avg, 
    NTH_VALUE(value, 2) OVER prev AS prev_val, 
    AVG(ABS(value)) OVER next AS next_avg, 
    NTH_VALUE(value, 2) OVER next AS next_val 
    FROM temp1 WINDOW 
    prev AS (PARTITION BY acm, file_date, start_file_time ORDER BY seconds ROWS 2 PRECEDING), 
    next AS (PARTITION BY acm, file_date, start_file_time ORDER BY seconds ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING) 
), temp3 AS (
    SELECT 
    acm, file_date, start_file_time, seconds, 
    CASE 
     WHEN (ABS(prev_val) < 50 AND ABS(next_val) >= 50 AND next_avg >= 50 AND prev_avg < 50) THEN 'start' 
     WHEN (ABS(next_val) < 50 AND ABS(prev_val) >= 50 AND prev_avg >= 50 AND next_avg < 50) THEN 'stop' 
    END AS kind 
    FROM temp2 
), temp4 AS (
    SELECT *, 
    COUNTIF(kind = 'start') OVER (PARTITION BY acm, file_date, start_file_time ORDER BY seconds) + 
    COUNTIF(kind = 'stop') OVER (PARTITION BY acm, file_date, start_file_time ORDER BY seconds ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS grp 
    FROM temp3 
) 
SELECT 
    acm, file_date, start_file_time, 
    MIN(seconds) AS start_seconds, 
    MAX(seconds) AS stop_seconds 
FROM temp4 
GROUP BY acm, file_date, start_file_time, grp 
HAVING MIN(kind) != MAX(kind) 
-- ORDER BY 1, 2, 3, 4 
1

希望这个查询能给出您想要的结果,并且能够成功处理您的数据集:

SELECT 
    * EXCEPT(file_data), 
    ARRAY(SELECT STRUCT(seconds, kind) FROM UNNEST(file_data) WHERE kind IS NOT NULL) file_data 
FROM(
    SELECT 
    ACM, 
    file_date, 
    start_file_time, 
    ARRAY(SELECT DISTINCT file_time FROM UNNEST(file_data)) file_times, 
    ARRAY(SELECT STRUCT(seconds, IF(value = 0, (CASE WHEN ABS(NTH_VALUE(value, 2) OVER(prev)) < 50 AND ABS(NTH_VALUE(value, 2) OVER(next)) >= 50 AND AVG(ABS(value)) OVER(next) >= 50 and AVG(ABS(value)) OVER(prev) < 50 THEN 'start' 
                WHEN ABS(NTH_VALUE(value, 2) OVER(next)) < 50 AND ABS(NTH_VALUE(value, 2) OVER(prev)) >= 50 AND AVG(ABS(value)) OVER(prev) >= 50 and AVG(ABS(value)) OVER(next) < 50 THEN 'stop' END), NULL) as kind) 
      FROM UNNEST(file_data) WINDOW prev AS (ORDER BY seconds ROWS 2 PRECEDING), next as(ORDER BY seconds ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING)) file_data 
    FROM(
    SELECT 
     ACM, 
     file_date, 
     TIME_SUB(file_time, INTERVAL CAST(FLOOR(seconds) AS INT64) SECOND) AS start_file_time, 
     ARRAY_AGG(STRUCT(file_time, value, seconds)) file_data 
    FROM test 
    GROUP BY ACM, file_date, start_file_time 
    ) 
) 

其结果与您在 test 数据中标注的“开始”和“停止”点一致。

有几点需要说明:

  • 我避免了昂贵的 JOIN 操作。
  • 尽可能地利用了ARRAY和STRUCTURES,不仅提高了存储效率,而且由于只处理了所需数据,因此不需要处理重复数据,从而提高了查询性能。
  • 只使用了 2 个 WINDOW 子句,分别作用于对应的 ARRAY 结构,从而提高了性能。注意,之所以可以这样做,是因为我已经把所有内容聚合到了 STRUCT 的 ARRAY 中,数据已经按组“归类”,因此不再需要更复杂的窗口子句(例如 PARTITION BY)。
  • 此查询中没有数据重复。
  • 注意结果现在有一个稍微不同的结构,我建议使用这个新的结构,因为它在数据存储和进一步处理上更有效。

让我知道这是否适合你。