2016-03-09 83 views
2

我从来没有真正用过VBA或任何其他类型的代码,除了几年前在VB中做过的小事情。这是我尝试编写一些代码,用于搜索客户帐户的Excel表数据库并搜索可能的重复帐户。不幸的是,在我需要运行这台计算机的机器上,它只能处理大约3,500个条目而不会崩溃Excel。我对这两个代码都做出了巨大的贡献,就像机器运行缓慢一样。优化VBA/Excel宏代码(在大表中查找重复项)

可以做些什么来优化下面的代码,以及我将来应该使用哪些VBA中的最佳实践?

'Essentially, this loops through each row in the sheet 
'For each row, it loops through every row after it, searching for duplicates of itself (skipping over a rows that have previously been marked as duplicates) 
'Duplicates are defined by entries that meet a 'threshhold' of similarity 
'The threshhold is defined as the number '5', first and last names are each two points, address and email address are one point 
'That means that in order for an entry to meet the thresshold, the first and last name must be the same, and it must also have either the same address or email 
'When duplicates are found, the duplicate column is marked as 'Yes' for that row, and the first occurence column is marked with a number defining the row number where the account first appeared 

Sub Main(): 
    Dim lNameCol, fNameCol, addressCol, emailCol, duplicateCol, fOccurenceCol As String 

    'Defines the column letters for the various data fields 
    lNameCol = "A" 
    fNameCol = "B" 
    addressCol = "C" 
    emailCol = "D" 
    duplicateCol = "E" 'The column where a entry/row will be marked as being a duplicate 
    fOccurenceCol = "F" 'The column that contains the row number where a duplicate accounts first occurence was found 

    Call Duplicates(lNameCol, fNameCol, addressCol, emailCol, duplicateCol, fOccurenceCol) 
End Sub 

'Gets number of rows in currently active sheet 
Function RowCount(): 
    Application.ActiveSheet.UsedRange 
    RowCount = Worksheets("Sheet1").UsedRange.Rows.Count 
End Function 

'Finds and labels duplicates 
Sub Duplicates(ByVal lNameCol As String, ByVal fNameCol As String, ByVal addressCol As String, ByVal emailCol As String, ByVal duplicateCol As String, ByVal fOccurenceCol As String) 
    Dim lRowCount As Integer 
    lRowCount = RowCount() 

    'Loops through each row in the sheet 
    For i = 1 To lRowCount 

     Dim duplicate, lastName, firstName, email, address As String 

     'Sets these variables' values corresponding cell value in row 'i' 
     'UCase capitilizes things to make entries case-insensitive 
     duplicate = UCase(Range(duplicateCol & i).Value) 
     lastName = UCase(Range(lNameCol & i).Value) 
     firstName = UCase(Range(fNameCol & i).Value) 
     email = UCase(Range(emailCol & i).Value) 
     address = UCase(Range(addressCol & i).Value) 

     'Checks to make sure row has not already been marked a duplicate, if it hasn't it continues 
     If (StrComp(duplicate = "YES", vbTextCompare) = 1) Then 

      'Loops through every row after the current row (row 'i') 
      For n = (i + 1) To lRowCount 

       'duplicateThreshold is an integer that defines the threshhold of similarity that rows need to have in order to be labeled a duplicate 
       Dim duplicateThreshhold As Integer 
       Dim lastName2, firstName2, email2, address2 As String 

       duplicateThreshhold = 0 

       'These are the entry variables for account entry at row 'n' being compared to the account entry at row 'i' 
       lastName2 = UCase(Range(lNameCol & n).Value) 
       firstName2 = UCase(Range(fNameCol & n).Value) 
       email2 = UCase(Range(emailCol & n).Value) 
       address2 = UCase(Range(addressCol & n).Value) 

       'Adds 2 points to threshhold if first name is the same 
       If lastName = lastName2 Then 
        duplicateThreshhold = duplicateThreshhold + 2 

       End If 

       'Adds 2 points to threshold if last name is the same 
       If firstName = firstName2 Then 
        duplicateThreshhold = duplicateThreshhold + 2 
       End If 

       'The remaining two fields give 1 point each to the thresshold 
       'As long as the sum of the points given by first and last name is always greater than half of the threshhold, first and last name will always be required 

       If email = email2 Or address = address2 Then 
        duplicateThreshhold = duplicateThreshhold + 1 
       End If 

       If duplicateThreshhold > 4 Then 
        'Labels duplicate entries as duplicates 
        Range(duplicateCol & i).Value = "Yes" 
        Range(duplicateCol & n).Value = "Yes" 

        'Labels duplicate entries with the first occurence of that entry 
        Range(fOccurenceCol & i).Value = i 'Labels first occurence account's row number 
        Range(fOccurenceCol & n).Value = i 

       End If 

      Next 
     End If 
    Next 


End Sub 
+0

为什么不用去重功能在Excel中,而不是使用VBA?在Excel 2013中:突出显示您的专栏,导航至* Data-> Remove Duplicates * –

+0

我在[本网站]上找到了提示(https://blogs.office.com/2009/03/12/excel-vba-性能编码最佳实践/)有助于让我的代码更快运行,好的第一步可能是看看那里。 – eirikdaude

+0

好吧,这听起来像是你正试图建立你自己的'模糊逻辑'的味道(MS已经完成得很好)。看看[设置自动筛选多通配符](http://stackoverflow.com/questions/16602872/),[高级筛选条件可以在VBA而不是范围?](http://stackoverflow.com/问题/ 34532282 /)和[自动筛选器是否可以从Dictionary键中同时采用包容性和非包含性通配符?](http://stackoverflow.com/questions/34614417)。 – Jeeped

回答

2

好了,这是那停留在我的后脑勺这些问题,所以我必须解决它(非常感谢@RJGordon!)。我最终以两种不同的方式来解决它 - 第一种是嵌套循环,第二种是散列字典。第二个是更清晰和更快的算法,但为了彻底,我将介绍这两个算法。

嵌套循环

由于@JohnColeman指出,这种方法是有道理的逻辑,但可怕的缩放。为每条记录提供所有重复行的列表非常简单,并且具有标记数据集中第一行的优点。 (第二种解决方案下面没有标签与下面的重复最初的记录,但如果需要的话,你可以解决这一点。)

Option Explicit 

Sub test() 
    MarkDuplicates ActiveSheet, 1, 2, 3, 4, 5, 6 
End Sub 

Sub MarkDuplicates(sh As Worksheet, lNameCol As Long, _ 
        fNameCol As Long, addressCol As Long, _ 
        emailCol As Long, duplicateCol As Long, _ 
        fOccuranceCol As Long) 
    Dim lastRow As Long 
    Dim lastCol As Long 
    Dim acctRange As Range 
    Dim acctData As Variant 
    Dim checkRow As Long 
    Dim otherRow As Long 
    Dim dupScore As Integer 
    Dim dupList As String 

    '--- determine the range of data and copy to a memory-based array 
    lastRow = sh.Cells(sh.Rows.Count, "A").End(xlUp).Row 
    lastCol = sh.Cells(1, sh.Columns.Count).End(xlToLeft).Column 
    Set acctRange = sh.Range("A1").Resize(lastRow, lastCol) 
    acctData = acctRange 

    '--- nested loop to check each row against every other row 
    For checkRow = 2 To lastRow 
     dupList = "" 
     For otherRow = 2 To lastRow 
      dupScore = 0 
      If otherRow <> checkRow Then 
       If acctData(checkRow, lNameCol) = acctData(otherRow, lNameCol) Then 
        dupScore = dupScore + 2 
       End If 
       If acctData(checkRow, fNameCol) = acctData(otherRow, fNameCol) Then 
        dupScore = dupScore + 2 
       End If 
       If acctData(checkRow, addressCol) = acctData(otherRow, addressCol) Then 
        dupScore = dupScore + 1 
       End If 
       If acctData(checkRow, emailCol) = acctData(otherRow, emailCol) Then 
        dupScore = dupScore + 1 
       End If 
       If dupScore > 4 Then 
        dupList = dupList & otherRow & "," 
       End If 
      End If 
     Next otherRow 
     If Len(dupList) > 0 Then 
      dupList = Left(dupList, Len(dupList) - 1) 
      acctData(checkRow, duplicateCol) = "Yes" 
      acctData(checkRow, fOccuranceCol) = dupList 
     Else 
      acctData(checkRow, duplicateCol) = "" 
      acctData(checkRow, fOccuranceCol) = "" 
     End If 
    Next checkRow 

    '--- copy the array back to the worksheet 
    acctRange = acctData 

    Set sh = Nothing 
End Sub 

使用字典

而且我的意思字典(复数)。由于您的重复分数阈值可以通过三种不同的字段组合来达到,因此您的字典哈希必须测试每种组合。我选择的字典关键字(散列)是字段的连接字符串,在测试时会指示重复的记录。此解决方案仅显示带有三个字典的单个循环。如果您想要找到找到的所有重复记录的列表,则重写代码以在单个循环中创建所有三个词典,然后针对每个词典键使用单独的(未嵌套的)循环来记录每个记录,并保留一个正在运行的傻瓜列表。 (为了提高效率,我将它保留在一个循环中)。

使用较长的键(例如lastName + firstName + address + email)创建单个字典会导致您的关键冲突具有重复的所有这些字段,但你仍然必须找到一种方法来测试其他组合。比我更聪明的人可能会想出一个更简单的方法。

Option Explicit 

Sub test() 
    MarkDuplicates ActiveSheet, 1, 2, 3, 4, 5, 6 
End Sub 

Sub MarkDuplicates(sh As Worksheet, lNameCol As Long, _ 
        fNameCol As Long, addressCol As Long, _ 
        emailCol As Long, duplicateCol As Long, _ 
        fOccuranceCol As Long) 
    Dim lastRow As Long 
    Dim lastCol As Long 
    Dim acctRange As Range 
    Dim acctData As Variant 
    Dim acctDict1 As Dictionary 
    Dim acctDict2 As Dictionary 
    Dim acctDict3 As Dictionary 
    Dim acctKey As String 
    Dim checkRow As Long 
    Dim otherRow As Long 
    Dim dupScore As Integer 
    Dim dupList As String 

    '--- determine the range of data and copy to a memory-based array 
    lastRow = sh.Cells(sh.Rows.Count, "A").End(xlUp).Row 
    lastCol = sh.Cells(1, sh.Columns.Count).End(xlToLeft).Column 
    Set acctRange = sh.Range("A1").Resize(lastRow, lastCol) 
    acctData = acctRange 

    Set acctDict1 = New Dictionary 
    Set acctDict2 = New Dictionary 
    Set acctDict3 = New Dictionary 

    '--- build the initial dictionary 
    ' for the key to trip as duplicate, there are three possible 
    ' combinations to check, so we make three dictionaries and 
    ' create keys as combinations of the fields 
    For checkRow = 2 To lastRow 
     '--- clear previous flags 
     acctData(checkRow, duplicateCol) = "" 
     acctData(checkRow, fOccuranceCol) = "" 

     '--- dupe is lastname + firstname 
     acctKey = acctData(checkRow, lNameCol) & acctData(checkRow, fNameCol) 
     If Not acctDict1.Exists(acctKey) Then 
      acctDict1.Add acctKey, checkRow 
     ElseIf acctData(checkRow, duplicateCol) <> "Yes" Then 
      acctData(checkRow, duplicateCol) = "Yes1" 
      acctData(checkRow, fOccuranceCol) = acctDict1.Item(acctKey) 
     End If 

     '--- dupe is lastname + address + email 
     acctKey = acctData(checkRow, lNameCol) & acctData(checkRow, addressCol) & _ 
        acctData(checkRow, emailCol) 
     If Not acctDict2.Exists(acctKey) Then 
      acctDict2.Add acctKey, checkRow 
     ElseIf acctData(checkRow, duplicateCol) <> "Yes" Then 
      acctData(checkRow, duplicateCol) = "Yes2" 
      acctData(checkRow, fOccuranceCol) = acctDict2.Item(acctKey) 
     End If 

     '--- dupe is firstname + address + email 
     acctKey = acctData(checkRow, fNameCol) & acctData(checkRow, addressCol) & _ 
        acctData(checkRow, emailCol) 
     If Not acctDict3.Exists(acctKey) Then 
      acctDict3.Add acctKey, checkRow 
     ElseIf acctData(checkRow, duplicateCol) <> "Yes" Then 
      acctData(checkRow, duplicateCol) = "Yes3" 
      acctData(checkRow, fOccuranceCol) = acctDict3.Item(acctKey) 
     End If 
    Next checkRow 

    '--- copy the array back to the worksheet 
    acctRange = acctData 

    Set sh = Nothing 
End Sub