几年前,我需要得到类似的结果,于是测试了在Teradata中拼接字符串的几种不同方法。顺便说一句,如果行数太多、拼接后的字符串超过64000个字符,那么所有这些方法都可能失败。
效率最高的方式是使用用户定义函数(UDF,用C语言编写):
-- Count how many ids share each channel path.
-- The inner query produces one PATH per id by calling the DelimitedBuildSorted
-- UDF with the channel, a 14-character rendering of ts, and the '>' delimiter.
-- NOTE(review): DelimitedBuildSorted is not defined in this file; presumably it
-- concatenates its first argument ordered by the second -- confirm against the UDF.
SELECT
    PATH,
    COUNT(*)
FROM
(
    SELECT
        DelimitedBuildSorted(
            MARKETINGCHANNEL,
            CAST(CAST(ts AS FORMAT 'yyyymmddhhmiss') AS VARCHAR(14)),
            '>'
        ) AS PATH
    FROM t
    GROUP BY id
) AS path_per_id
GROUP BY PATH;
如果您需要经常运行该查询,和/或在一张大表上运行,就得和您的DBA商量能否使用UDF(大多数DBA不喜欢UDF,因为它们是用DBA不熟悉的语言——C——编写的)。
如果每个ID的平均行数较少,使用递归也是可以的。Joseph B的版本还可以稍微简化一些,但最重要的是先创建一个临时表来计算ROW_NUMBER,而不是使用视图或派生表。这样会得到更好的执行计划(在SQL Server中也是如此):
-- Stage the per-id row numbering of t in a volatile table so the recursive
-- query that follows can join on it directly.
CREATE VOLATILE TABLE vt AS
(
    SELECT
        id,
        MarketingChannel,
        -- rows are numbered newest-first, so rn = 1 is the latest row of an id
        ROW_NUMBER() OVER (PARTITION BY id ORDER BY TS DESC) AS rn,
        -- row count of the id, which is also the rn of its oldest row
        COUNT(*) OVER (PARTITION BY id) AS max_rn
    FROM t
)
WITH DATA
PRIMARY INDEX (id)
ON COMMIT PRESERVE ROWS;
-- Build each id's channel path by walking vt from the oldest row to the newest,
-- then count how many ids end up with the same path.
WITH RECURSIVE path_walk (id, path, rn) AS
(
    -- Seed: the row whose rn equals max_rn, i.e. the oldest row of each id
    -- (vt numbers rows by TS descending).
    SELECT
        vt.id,
        -- modify VARCHAR size to fit your maximum number of rows, that's better than VARCHAR(64000)
        CAST(vt.MarketingChannel AS VARCHAR(10000)) AS path,
        vt.rn
    FROM vt
    WHERE vt.rn = vt.max_rn
    UNION ALL
    -- Step: append the row of the same id whose rn is one lower (the next newer row).
    SELECT
        path_walk.id,
        path_walk.path || '>' || vt.MarketingChannel,
        path_walk.rn - 1
    FROM vt
    INNER JOIN path_walk
        ON  vt.id = path_walk.id
        AND vt.rn = path_walk.rn - 1
)
-- Only rows that reached rn = 1 carry the complete path of an id.
SELECT
    PATH,
    COUNT(*)
FROM path_walk
WHERE rn = 1
GROUP BY PATH
ORDER BY PATH
;
你也可以试试老派的MAX(CASE)写法:
-- Count ids per channel path using conditional aggregation.
-- Every row of an id gets a zero-based position (rnk) in TS order; the path is
-- the concatenation of the channels at positions 0..31, separated by '>'.
-- Limitation: only 32 positions are listed, so rows beyond the 32nd of an id
-- are left out of its path.
SELECT
    PATH
   ,COUNT(*)
FROM
 (
   SELECT
      id
     ,MAX(CASE WHEN rnk =  0 THEN MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk =  1 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk =  2 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk =  3 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk =  4 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk =  5 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk =  6 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk =  7 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk =  8 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk =  9 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 10 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 11 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 12 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 13 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 14 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 15 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 16 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 17 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 18 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 19 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 20 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 21 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 22 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 23 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 24 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 25 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 26 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 27 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 28 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 29 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 30 THEN '>' || MarketingChannel ELSE '' END) ||
      MAX(CASE WHEN rnk = 31 THEN '>' || MarketingChannel ELSE '' END) AS PATH
   FROM
    (
      SELECT
         id
        ,TRIM(MarketingChannel) AS MarketingChannel
         -- ROW_NUMBER rather than RANK: RANK gives rows of one id that share a TS
         -- the same position, so MAX would keep only one of them and the next
         -- position would stay empty. The channel is a tie-breaker only, to make
         -- the order of such rows repeatable.
        ,ROW_NUMBER() OVER (PARTITION BY id
                            ORDER BY TS, t.MarketingChannel) - 1 AS rnk
      FROM t
    ) dt
   GROUP BY id
 ) AS dt
GROUP BY PATH;
我曾经需要拼接多达2048行、每行30个字符的数据 :-)
-- Count ids per channel path for ids with up to 2048 rows (16 * 16 * 8 slots).
-- Every row of an id gets a zero-based position (rnk) in TS order, and the path
-- is assembled in three rounds of conditional aggregation:
--   round 1 joins every run of 16 consecutive positions into one chunk,
--   round 2 joins every run of 16 chunks (256 rows) into a larger chunk,
--   round 3 joins up to 8 of those larger chunks into the final PATH.
-- After each round rnk is divided by 16, so it numbers the chunks just built.
-- Limitation: ids with more than 2048 rows are not concatenated correctly,
-- because round 3 lists only slots 0..7.
-- GROUP BY stays positional: the second item of rounds 1 and 2 is the rnk/16
-- expression, whose alias (rnk) collides with the name of the input column.
SELECT
    PATH
   ,COUNT(*)
FROM
 (
   -- round 3: up to 8 chunks of 256 rows each
   SELECT
      id
     ,MAX(CASE WHEN rnk MOD 16 = 0 THEN path ELSE '' END) ||
      MAX(CASE WHEN rnk MOD 16 = 1 THEN '>' || path ELSE '' END) ||
      MAX(CASE WHEN rnk MOD 16 = 2 THEN '>' || path ELSE '' END) ||
      MAX(CASE WHEN rnk MOD 16 = 3 THEN '>' || path ELSE '' END) ||
      MAX(CASE WHEN rnk MOD 16 = 4 THEN '>' || path ELSE '' END) ||
      MAX(CASE WHEN rnk MOD 16 = 5 THEN '>' || path ELSE '' END) ||
      MAX(CASE WHEN rnk MOD 16 = 6 THEN '>' || path ELSE '' END) ||
      MAX(CASE WHEN rnk MOD 16 = 7 THEN '>' || path ELSE '' END) AS PATH
   FROM
    (
      -- round 2: 16 chunks of 16 rows each
      SELECT
         id
        ,rnk/16 AS rnk
        ,MAX(CASE WHEN rnk MOD 16 =  0 THEN path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 =  1 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 =  2 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 =  3 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 =  4 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 =  5 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 =  6 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 =  7 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 =  8 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 =  9 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 = 10 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 = 11 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 = 12 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 = 13 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 = 14 THEN '>' || path ELSE '' END) ||
         MAX(CASE WHEN rnk MOD 16 = 15 THEN '>' || path ELSE '' END) AS path
      FROM
       (
         -- round 1: 16 consecutive rows
         SELECT
            id
           ,rnk/16 AS rnk
           ,MAX(CASE WHEN rnk MOD 16 =  0 THEN path ELSE '' END) ||
            MAX(CASE WHEN rnk MOD 16 =  1 THEN '>' || path ELSE '' END) ||
            MAX(CASE WHEN rnk MOD 16 =  2 THEN '>' || path ELSE '' END) ||
            MAX(CASE WHEN rnk MOD 16 =  3 THEN '>' || path ELSE '' END) ||
            MAX(CASE WHEN rnk MOD 16 =  4 THEN '>' || path ELSE '' END) ||
            MAX(CASE WHEN rnk MOD 16 =  5 THEN '>' || path ELSE '' END) ||
            MAX(CASE WHEN rnk MOD 16 =  6 THEN '>' || path ELSE '' END) ||
            MAX(CASE WHEN rnk MOD 16 =  7 THEN '>' || path ELSE '' END) ||
            MAX(CASE WHEN rnk MOD 16 =  8 THEN '>' || path ELSE '' END) ||
            MAX(CASE WHEN rnk MOD 16 =  9 THEN '>' || path ELSE '' END) ||
            MAX(CASE WHEN rnk MOD 16 = 10 THEN '>' || path ELSE '' END) ||
            MAX(CASE WHEN rnk MOD 16 = 11 THEN '>' || path ELSE '' END) ||
            MAX(CASE WHEN rnk MOD 16 = 12 THEN '>' || path ELSE '' END) ||
            MAX(CASE WHEN rnk MOD 16 = 13 THEN '>' || path ELSE '' END) ||
            MAX(CASE WHEN rnk MOD 16 = 14 THEN '>' || path ELSE '' END) ||
            MAX(CASE WHEN rnk MOD 16 = 15 THEN '>' || path ELSE '' END) AS path
         FROM
          (
            SELECT
               id
              ,TRIM(MarketingChannel) AS PATH
               -- ROW_NUMBER rather than RANK: RANK gives rows of one id that
               -- share a TS the same position, so MAX would keep only one of
               -- them and the next position would stay empty. The channel is a
               -- tie-breaker only, to make the order of such rows repeatable.
              ,ROW_NUMBER() OVER (PARTITION BY id
                                  ORDER BY TS, t.MarketingChannel) - 1 AS rnk
            FROM t
          ) dt
         GROUP BY 1,2
       ) dt
      GROUP BY 1,2
    ) dt
   GROUP BY 1
 ) dt
GROUP BY 1;
您如何从营销渠道中获取路径? –
因此,我添加了更多的数据。为了获得营销路径,本质上你只是按时间戳排序。所以,ID#1的路径是SEO> SEO>付费。 ID#2将是付费>付费>会员>付费。 – cloud36
我只是不确定应该如何汇总这些信息,才能得到下面那张表的结果。 – cloud36