2016-06-13 29 views
2

我对VBA和HTML/XHTML非常陌生,但通过在线研究和其他精彩成员的帮助,我已经设法编写了一个代码来提取我想要的数据。由于XHTML是我想要的元素,所以我很难识别这些元素的ID,所以我认为这是我最糟糕的地方。XHTML网站掠夺指导

网站:http://www.usbanklocations.com/banks.php?q=&ct=&ml=30&lc=

这是我想要的代码做: 拉银行名称,地址,电话号码,总存款和总资产 - 定的银行名称和城市我在我的Excel表中提供。

这里是我的代码:

Private Declare Sub Sleep Lib "kernel32.dll" (ByVal dwMilliseconds As Long) 
Sub CommunityBanks() 
    Dim IE As Object, TableResults As Object, webRow As Object, BankName As Variant, page As Long, pageTotal As Long, r As Long 
    Dim beginTime As Date, i As Long, myvalue As Variant 

Set IE = CreateObject("internetexplorer.application") 
IE.navigate "http://www.usbanklocations.com/banks.php?name=" & Range("A2").Value & "+Bank&ml=30&lc=" & Range("B2").Value & "%2C+TX" 
IE.Visible = True 

Do While IE.Busy Or IE.readystate <> 4 '4 = READYSTATE_COMPLETE 
    DoEvents 
Loop 

'input bank name into form 
'myvalue = InputBox("Enter City. Press okay to begin search", "Bank Search") 
'Range("F3").Value = myvalue 
'IE.document.getelementbyid("MainContent_txtCity").Value = "LegacyTexas" 
'click find button 
'IE.document.getelementbyid("MainContent_btn").Click 
'Sleep 5 * 1000 
IE.document.getelementbytagname("table").getelementsbyclassname("btn").Click 
Sleep 5 * 1000 

'total pages 
pageTotal = IE.document.getelementbyid("lsortby").innertext 
page = 0 

Do Until page = pageTotal 
    DoEvents 
    page = IE.document.getelementbyclassname("lsortby").innertext 
    With IE.document.getelementbyid("main") 
     For r = 1 To .Rows.Length - 1 
      If Not IsArray(BankName) Then 
       ReDim BankName(7, 0) As Variant 
      Else 
       ReDim Preserve BankName(7, UBound(BankName, 2) + 1) As Variant 
      End If 

      BankName(0, UBound(BankName, 2)) = .Rows(r).Cells(0).innertext 
     Next r 
    End With 

    If page < pageTotal Then 
     IE.document.getelementbyclassname("panelpn").Click 
     beginTime = Now 
     Application.Wait (Now + TimeValue("00:00:05")) 
    End If 
Loop 

For r = 0 To UBound(BankName, 2) 
    IE.navigate "http://www.usbanklocations.com/" & BankName(0, r) 
    Do While IE.Busy Or IE.readystate <> 4 '4 = READYSTATE_COMPLETE 
     DoEvents 
    Loop 
    'wait 5 sec. for screen refresh 
    Sleep 5 * 1000 

    With IE.document.getelementbytagname("table") 
     For i = 0 To .Rows.Length - 1 
      DoEvents 
      Select Case .Rows(i).Cells(0).innertext 
      Case "Name:" 
       BankName(1, r) = .Rows(i).Cells(1).innertext 
      Case "Location:" 
       BankName(2, r) = .Rows(i).Cells(1).innertext 
      Case "Phone:" 
       BankName(3, r) = .Rows(i).Cells(1).innertext 
      Case "Branch Deposit:" 
       BankName(4, r) = Replace(Replace(.Rows(i).Cells(1).innertext, ",", ""), "$", "") 
      Case "Total Assets:" 
       BankName(5, r) = Replace(Replace(.Rows(i).Cells(1).innertext, ",", ""), "$", "") 
      End Select 
     Next i 
    End With 
Next r 


IE.Quit 
Set IE = Nothing 

'post result on Excel cell 
Worksheets(1).Range("A9").Resize(UBound(BankName, 2) + 1, UBound(BankName, 1) + 1).Value = Application.Transpose(BankName) 
End Sub 

预先感谢您!我将不胜感激任何帮助。

+1

The [ToS for usbanklocations.com](http://www.usbanklocations.com/terms-of-use.php)指出用户不能'聚集,复制或复制USBANKLOCATIONS.COM上的内容 - so我敢肯定,你不应该刮他们的网站...... –

+0

由“在”,他们指的是具体到他们的网站的行动。不是用户可以使用的内容。您可以复制/粘贴信息。 –

+0

好的 - 我通常不会参与刮胡子的问题,只是为了谨慎。我只是指出,如果你不知道,但如果你很高兴,这是好的,那么足够公平。 –

回答

2

考虑它使用XHR而不是IE和基于分HTML内容解析下面的例子:

Option Explicit 

Sub Test_usbanklocations() 

    Dim oSource, oDestination, y, oSrcRow, sName, sCity, sDist, sUrl0, sUrl1, sUrl2, lPage, sResp1, sResp2, i, a1, a2, a3, a4, a5 

    Set oSource = Sheets(1) 
    Set oDestination = Sheets(2) 
    oDestination.Cells.Delete 
    DataOutput oDestination, 1, Array("Name", "Location", "Phone", "Total Assets", "Total Deposits") 
    y = 2 

    For Each oSrcRow In oSource.UsedRange.Rows 
     sName = oSrcRow.Cells(1, 1).Value 
     sCity = oSrcRow.Cells(1, 2).Value 
     sDist = oSrcRow.Cells(1, 3).Value 
     sUrl0 = "http://www.usbanklocations.com/banks.php?q=" & EncodeUriComponent(sName) & "&lc=" & EncodeUriComponent(sCity) & "&ml=" & sDist 
     sUrl1 = sUrl0 
     lPage = 1 
     Do 
      sResp1 = GetXHR(sUrl1) 
      If InStr(sResp1, "We can not find the address you provided. Please check.") > 0 Then Exit Do 
      a1 = Split(sResp1, "<div class=""pl") 
      For i = 1 To UBound(a1) 
       a2 = Split(a1(i), "</div>", 3) 
       a3 = Split(a2(1), "<a href=""", 2) 
       a4 = Split(a3(1), """>", 2) 
       sUrl2 = "http://www.usbanklocations.com" & a4(0) 
       sResp2 = GetXHR(sUrl2) 
       a5 = Array(_ 
        GetFragment(sResp2, "<b>Name:</b></td><td>", "</td>"), _ 
        Replace(GetFragment(sResp2, "<b>Location:</b></td><td>", "</td>"), "View Other Branches", ""), _ 
        GetFragment(sResp2, "<b>Phone:</b></td>", "</td>"), _ 
        GetFragment(sResp2, "<b>Total Assets:</b></td><td>", "</td>"), _ 
        GetFragment(sResp2, "<b>Total Deposits:</b></td><td>", "</td>") _ 
       ) 
       DataOutput oDestination, y, a5 
       y = y + 1 
       DoEvents 
      Next 
      If InStr(sResp1, "Next Page &gt;") = 0 Then Exit Do 
      lPage = lPage + 1 
      sUrl1 = sUrl0 & "&ps=" & lPage 
      DoEvents 
     Loop 
    Next 

    MsgBox "Completed" 

End Sub 

Function GetXHR(sUrl) 

    With CreateObject("MSXML2.XMLHTTP") 
     .Open "GET", sUrl, False 
     .Send 
     GetXHR = .ResponseText 
    End With 

End Function 

Sub DataOutput(oSht, y, aValues) 

    With oSht.Cells(y, 1).Resize(1, UBound(aValues) + 1) 
     .NumberFormat = "@" 
     .Value = aValues 
    End With 

End Sub 

Function GetFragment(sText, sPatt1, sPatt2) 

    Dim a1, a2 

    a1 = Split(sText, sPatt1, 2) 
    If UBound(a1) <> 1 Then Exit Function 
    a2 = Split(a1(1), sPatt2, 2) 
    If UBound(a2) <> 1 Then Exit Function 
    GetFragment = GetInnerText(a2(0)) 

End Function 

Function EncodeUriComponent(sText) 

    Static objHtmlfile As Object 

    If objHtmlfile Is Nothing Then 
     Set objHtmlfile = CreateObject("htmlfile") 
     objHtmlfile.parentWindow.execScript "function encode(s) {return encodeURIComponent(s)}", "jscript" 
    End If 
    EncodeUriComponent = objHtmlfile.parentWindow.encode(sText) 

End Function 

Function GetInnerText(sText) 

    With CreateObject("htmlfile") 
     .Write ("<body>" & sText & "</body>") 
     GetInnerText = .DocumentElement.Document.GetElementsByTagName("body")(0).InnerText 
    End With 

End Function 

作为一个例子,第一个工作表中包含的数据搜索(银行名称,位置和距离的细化):

source

然后,第二工作表上结果如下:

result

+0

你真棒@墨守成规!这个XHR/api方法是一个很好的基础。非常感谢你。实际上,我只是熟悉XHR,这将是我的第一个以这种格式来看待的代码。 我注意到大型数据集的速度要快得多。非常感谢。 –

+0

@ K.K。顺便说一句,使XHR异步,你可以达到更高的速度,但代码应该与事件一起工作。 – omegastripes

+0

@omegastripes,谢谢你的代码。对我来说这是一项全新的技能。我从中学到了东西。 – PaichengWu