2016-08-14 55 views
2

我正在使用此脚本来获取带有Microsoft Excel的网页的文本数据,但是,它仅返回文本,但我想要获取分隔列中的超链接。你可以帮我吗? 看来该命令只返回文本数据,但我正在寻找将文本和对应的URL保存为文本(当然不是超链接!)。我的评论 https://msdn.microsoft.com/en-us/library/office/ff836520.aspx但我找不到任何东西。在Excel中获取带有超链接和VB表格的网页表

您可以在代码中看到提供了网址的网页。

Sub SaveUrl() 
    Set shFirstQtr = Workbooks(1).Worksheets(1) 
    Set qtQtrResults = shFirstQtr.QueryTables _ 
         .Add(Connection:="URL;http://www.tsetmc.com/Loader.aspx?ParTree=111C1417", _ 
          Destination:=shFirstQtr.Cells(1, 1)) 
    With qtQtrResults 
     .WebFormatting = xlNone 
     .WebSelectionType = xlSpecifiedTables 
     .WebTables = "1" 
     .Refresh 
    End With 
End Sub 

回答

1

下面是表示如何自动IE和从DOM(运行TestIE())检索必要的数据,并且使请求与XHR和解析响应与正则表达式的例子中(运行TestXHR()):

Option Explicit 

' The code to automate IE and retrieve the necessary data from DOM 

Sub TestIE() 

    Dim aText() As Variant 
    Dim aHref() As Variant 
    Dim aHrefExists() As Boolean 
    Dim aRes() As Variant 
    Dim lRowsCount As Long 
    Dim lCellsCount As Long 
    Dim i As Long 
    Dim j As Long 
    Dim lCellsTotal As Long 
    Dim x As Long 

    With CreateObject("InternetExplorer.Application") 
     ' Make visible for debug 
     .Visible = True 
     ' Navigate to page 
     .Navigate "http://www.tsetmc.com/Loader.aspx?ParTree=111C1417" 
     ' Wait for IE ready 
     Do While .ReadyState <> 4 Or .Busy 
      DoEvents 
     Loop 
     ' Wait for document complete 
     Do While .Document.ReadyState <> "complete" 
      DoEvents 
     Loop 
     ' Wait for target table accessible 
     Do While TypeName(.Document.getElementById("tblToGrid")) = "Null" 
      DoEvents 
     Loop 
     ' Process target table 
     With .Document.getElementById("tblToGrid") 
      ' Get table size 
      lRowsCount = .Rows.Length 
      lCellsCount = .Rows(0).Cells.Length 
      ' Create 2d arrays for texts and hyperlinks values, and for column url existance flag 
      ReDim aText(1 To lRowsCount, 1 To lCellsCount) 
      ReDim aHref(1 To lRowsCount, 1 To lCellsCount) 
      ReDim aHrefExists(1 To lCellsCount) 
      ' Process each table row 
      For i = 1 To lRowsCount 
       With .Rows(i - 1) 
        ' Process each cell 
        For j = 1 To lCellsCount 
         ' Retrieve text content 
         aText(i, j) = .Cells(j - 1).innerText 
         ' Retrieve hyperlink if exists 
         With .Cells(j - 1).getElementsByTagName("a") 
          If .Length = 1 Then 
           aHrefExists(j) = True 
           aHref(i, j) = .Item(0).href 
          End If 
         End With 
        Next 
       End With 
      Next 
     End With 
     .Quit 
    End With 
    ' Create resulting array that includes texts and urls 
    lCellsTotal = lCellsCount 
    For j = 1 To lCellsCount 
     If aHrefExists(j) Then lCellsTotal = lCellsTotal + 1 
    Next 
    ReDim aRes(1 To lRowsCount, 1 To lCellsTotal) 
    ' Populate array with texts and urls 
    x = 1 
    For j = 1 To lCellsCount 
     For i = 1 To lRowsCount 
      aRes(i, x) = aText(i, j) 
     Next 
     x = x + 1 
     If aHrefExists(j) Then 
      For i = 1 To lRowsCount 
       aRes(i, x) = aHref(i, j) 
      Next 
      x = x + 1 
     End If 
    Next 
    ' Result output to sheet 1 
    With Sheets(1) 
     .Cells.Delete 
     Output .Cells(1, 1), aRes 
    End With 
End Sub 

' The code to make request with XHR and parse response with RegEx 

Sub TestXHR() 

    Dim sRespText As String 
    Dim oRERows As Object 
    Dim oRECells As Object 
    Dim aRes() As Variant 
    Dim lRowsCount As Long 
    Dim lCellsCount As Long 
    Dim i As Long 
    Dim j As Long 
    Dim lCellsTotal As Long 
    Dim x As Long 

    ' Retrieve HTML content 
    With CreateObject("MSXML2.XMLHttp") 
     .Open "GET", "http://www.tsetmc.com/Loader.aspx?ParTree=111C1417", False 
     .Send 
     sRespText = .responseText 
    End With 
    ' Regular expression for table rows setup 
    Set oRERows = CreateObject("VBScript.RegExp") 
    With oRERows 
     .Global = True 
     .MultiLine = True 
     .IgnoreCase = True 
     .Pattern = "<tr.*?>[\s\S]*?</tr>" 
    End With 
    ' Regular expression for table cells setup 
    Set oRECells = CreateObject("VBScript.RegExp") 
    With oRECells 
     .Global = True 
     .MultiLine = True 
     .IgnoreCase = True 
     .Pattern = "<td.*?>(?:.*?<a.*?href=(""|')(.*?)\1.*?>(.*?)</a>.*?|(.*?))</td>" 
    End With 
    ' Execute 1st regexp on response 
    With oRERows.Execute(sRespText) 
     ' Get table size 
     lRowsCount = .Count 
     lCellsCount = oRECells.Execute(.Item(0).Value).Count 
     ' Create 2d arrays for texts and hyperlinks values, and for column url existance flag 
     ReDim aText(1 To lRowsCount, 1 To lCellsCount) 
     ReDim aHref(1 To lRowsCount, 1 To lCellsCount) 
     ReDim aHrefExists(1 To lCellsCount) 
     ' Process each table row 
     For i = 1 To lRowsCount 
      ' Get 1st regexp match value, and execute 2nd regexp on it 
      With oRECells.Execute(.Item(i - 1).Value) 
      ' Process each cell 
      For j = 1 To .Count 
       With .Item(j - 1) 
        If .SubMatches(3) <> "" Then 
         ' Retrieve text content only 
         aText(i, j) = .SubMatches(3) 
        Else 
         ' Retrieve text content and hyperlink 
         aText(i, j) = .SubMatches(2) 
         aHref(i, j) = "http://www.tsetmc.com/" & .SubMatches(1) 
         aHrefExists(j) = True 
        End If 
       End With 
      Next 
      End With 
     Next 
    End With 
    ' Create resulting array that includes texts and urls 
    lCellsTotal = lCellsCount 
    For j = 1 To lCellsCount 
     If aHrefExists(j) Then lCellsTotal = lCellsTotal + 1 
    Next 
    ReDim aRes(1 To lRowsCount, 1 To lCellsTotal) 
    ' Populate array with texts and urls 
    x = 1 
    For j = 1 To lCellsCount 
     For i = 1 To lRowsCount 
      aRes(i, x) = aText(i, j) 
     Next 
     x = x + 1 
     If aHrefExists(j) Then 
      For i = 1 To lRowsCount 
       aRes(i, x) = aHref(i, j) 
      Next 
      x = x + 1 
     End If 
    Next 
    ' Result output to sheet 2 
    With Sheets(2) 
     .Cells.Delete 
     Output .Cells(1, 1), aRes 
    End With 

End Sub 

' Utility section 

Sub Output(objDstRng As Range, arrCells As Variant) 
    With objDstRng 
     .Parent.Select 
     With .Resize(_ 
       UBound(arrCells, 1) - LBound(arrCells, 1) + 1, _ 
       UBound(arrCells, 2) - LBound(arrCells, 2) + 1) 
      .NumberFormat = "@" 
      .Value = arrCells 
      .Columns.AutoFit 
     End With 
    End With 
End Sub 

两种方法都给出了相同的结果(在表1和2):

result

+0

非常感谢你,它的工作!但我仍然在理解它。 :) –

相关问题