下面我使用 sys.dm_fts_parser 把句子拆分成单词。如果你用的不是 SQL Server 2008,或者因为某种原因发现它不适用,网上也有大量现成的 T-SQL 拆分函数可用。
由于要求每个 A.id 只能与之前尚未使用过的 B.id 配对(反之亦然),我想不出一个高效的基于集合的解决方案。
-- Builds #IntermediateResults: one row per (A sentence, B sentence) pair that
-- shares at least one word, numbered by id from best match to worst.
--   Cnt          = number of joined word rows for the pair
--   PctMatchBToA = Cnt / number of words in the B sentence
--   PctMatchAToB = Cnt / number of words in the A sentence
;WITH A (Id, sentence) AS
(
    -- Sample "A" sentences.
    SELECT 1, 'What other text in here' UNION ALL
    SELECT 2, 'What am I doing here' UNION ALL
    SELECT 3, 'I need to find another job' UNION ALL
    SELECT 4, 'Other text in here'
),
B (Id, sentence) AS
(
    -- Sample "B" sentences.
    SELECT 5, 'Other text in here' UNION ALL
    SELECT 6, 'I am doing what here' UNION ALL
    SELECT 7, 'Purple unicorns' UNION ALL
    SELECT 8, 'What are you doing in here'
),
A_Split AS
(
    -- One row per term the full-text parser returns for each A sentence.
    -- The sentence is wrapped in double quotes (embedded quotes doubled) so it
    -- is handed to the parser as a single phrase. Remaining arguments are
    -- lcid = 1033, stoplist_id = 0, accent_sensitivity = 0.
    SELECT
        src.Id AS A_Id,
        parsed.display_term,
        COUNT(*) OVER (PARTITION BY src.Id) AS A_Cnt
    FROM A AS src
    CROSS APPLY sys.dm_fts_parser('"' + REPLACE(src.sentence, '"', '""') + '"', 1033, 0, 0) AS parsed
),
B_Split AS
(
    -- Same split for the B sentences.
    SELECT
        src.Id AS B_Id,
        parsed.display_term,
        COUNT(*) OVER (PARTITION BY src.Id) AS B_Cnt
    FROM B AS src
    CROSS APPLY sys.dm_fts_parser('"' + REPLACE(src.sentence, '"', '""') + '"', 1033, 0, 0) AS parsed
),
Joined AS
(
    -- Pair up sentences on equal terms and count the shared terms per pair.
    -- The word counts are constant per sentence, so grouping by them does not
    -- split any (A_Id, B_Id) group.
    SELECT
        a_words.A_Id,
        b_words.B_Id,
        b_words.B_Cnt,
        COUNT(*) AS Cnt,
        CAST(COUNT(*) AS FLOAT) / b_words.B_Cnt AS PctMatchBToA,
        CAST(COUNT(*) AS FLOAT) / a_words.A_Cnt AS PctMatchAToB
    FROM A_Split AS a_words
    INNER JOIN B_Split AS b_words
        ON a_words.display_term = b_words.display_term
    GROUP BY
        a_words.A_Id,
        b_words.B_Id,
        b_words.B_Cnt,
        a_words.A_Cnt
)
SELECT
    -- ROW_NUMBER() ties id to the ranking itself. IDENTITY() combined with
    -- SELECT ... INTO ... ORDER BY is not guaranteed to assign values in
    -- ORDER BY sequence. A_Id, B_Id break ties so the numbering is repeatable.
    ROW_NUMBER() OVER (
        ORDER BY PctMatchBToA DESC,
                 PctMatchAToB DESC,
                 A_Id,
                 B_Id
    ) AS id,
    A_Id,
    B_Id,
    B_Cnt,
    Cnt,
    PctMatchBToA,
    PctMatchAToB
INTO #IntermediateResults
FROM Joined;
-- Greedy one-to-one pairing: repeatedly take the remaining candidate with the
-- lowest id from #IntermediateResults, record it, then discard every other
-- candidate that uses the same A_Id or the same B_Id so neither side is reused.
DECLARE @Results TABLE (
    A_Id INT,
    B_Id INT,
    Cnt  INT
);

DECLARE @PickedA_Id INT,
        @PickedB_Id INT,
        @PickedCnt  INT;

WHILE 1 = 1
BEGIN
    -- Fetch the best remaining candidate (lowest id).
    SELECT TOP (1)
        @PickedA_Id = A_Id,
        @PickedB_Id = B_Id,
        @PickedCnt  = Cnt
    FROM #IntermediateResults
    ORDER BY id;

    -- The assignment above touched no row: nothing is left to pair.
    IF @@ROWCOUNT = 0
        BREAK;

    INSERT INTO @Results (A_Id, B_Id, Cnt)
    VALUES (@PickedA_Id, @PickedB_Id, @PickedCnt);

    -- Removes the chosen row itself plus all candidates that conflict with it.
    DELETE FROM #IntermediateResults
    WHERE A_Id = @PickedA_Id
       OR B_Id = @PickedB_Id;
END;

DROP TABLE #IntermediateResults;

SELECT
    A_Id,
    B_Id,
    Cnt
FROM @Results
ORDER BY A_Id;
返回
A_Id B_Id Cnt
----------- ----------- -----------
1 8 3
2 6 5
4 5 4
3 与 6 匹配,两者都含有单词 “I”。而且 1 与 5 的匹配比与 8 的匹配更好,它们共有 4 个单词。 – 2011-03-31 15:08:44
你是对的,但我忘了提及匹配结果中不应该有重复:某一行一旦匹配过,就不能再次参与匹配。关于 5 与 8 匹配这一点你说得也对,不过正如我刚才在你的答案下评论的那样,理想情况下应该把匹配单词占全部单词的百分比考虑进去。 – 2011-03-31 15:17:48