2014-01-26 87 views
0

我正试图针对一组邮编抓取一个使用 Cookie 的动态页面,例如: https://www.chase.com/mortgage/loan-officer/search-results.html#action-search;zipcode-11747;lastname-;language-

如果你把它放到浏览器中,你会得到结果,但是在代码中尝试这样做会失败。

首先我尝试了以下代码:

// Issue a plain GET for URI and read the whole response body as trimmed text.
HttpWebRequest request = (HttpWebRequest)System.Net.WebRequest.Create(URI);
string page;
// The response must actually be requested before its stream can be read;
// dispose both the response and the reader so the connection is released.
using (var resp = (HttpWebResponse)request.GetResponse())
using (var sr = new System.IO.StreamReader(resp.GetResponseStream()))
{
    page = sr.ReadToEnd().Trim();
}

但这不起作用;下面这段由 Fiddler 插件生成的代码同样无效,没有返回任何结果。那么我到底漏掉了什么?

/// <summary>
/// Sends the hard-coded request and, when a response is available, reads its
/// body as text before closing the response.
/// </summary>
private void MakeRequests()
{
    HttpWebResponse response;
    string responseText;

    if (Request_www_chase_com(out response))
    {
        try
        {
            responseText = ReadResponse(response);
        }
        finally
        {
            // Close even when reading the body throws, so the underlying
            // connection is always released.
            response.Close();
        }
    }
}

/// <summary>
/// Reads the entire response body as UTF-8 text, decompressing it first when
/// the Content-Encoding header names gzip or deflate.
/// </summary>
/// <param name="response">The response whose body is read.</param>
/// <returns>The decoded body text.</returns>
private static string ReadResponse(HttpWebResponse response)
{
    using (Stream responseStream = response.GetResponseStream())
    {
        // Match the header with an ordinal, case-insensitive comparison.
        // ToLower() is culture-sensitive: under the Turkish culture "GZIP"
        // lowers to "gzıp" (dotless i) and would never match "gzip".
        string contentEncoding = response.ContentEncoding ?? string.Empty;

        Stream streamToRead = responseStream;
        if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            streamToRead = new GZipStream(streamToRead, CompressionMode.Decompress);
        }
        else if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            streamToRead = new DeflateStream(streamToRead, CompressionMode.Decompress);
        }

        // Disposing the reader also disposes the stream it wraps.
        using (StreamReader streamReader = new StreamReader(streamToRead, Encoding.UTF8))
        {
            return streamReader.ReadToEnd();
        }
    }
}

/// <summary>
/// Sends a request to the Chase loan-officer search-results page using a fixed
/// set of hard-coded headers and cookies.
/// </summary>
/// <param name="response">
/// Receives the server response, including HTTP error responses reported as
/// <see cref="WebExceptionStatus.ProtocolError"/>; <c>null</c> when no response
/// was obtained.
/// </param>
/// <returns>
/// <c>true</c> when <paramref name="response"/> holds a response;
/// <c>false</c> when the request failed without one.
/// </returns>
private bool Request_www_chase_com(out HttpWebResponse response)
{
    response = null;

    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create("https://www.chase.com/mortgage/loan-officer/search-results.html");

        request.KeepAlive = true;
        request.Headers.Set(HttpRequestHeader.CacheControl, "max-age=0");
        request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
        request.UserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36";
        request.Headers.Add("DNT", @"1");
        request.Referer = "https://mail.google.com/mail/u/0/?shva=1";
        // Advertise only encodings the reading code can decode (gzip/deflate);
        // "sdch" was dropped because no decoder for it is available here.
        request.Headers.Set(HttpRequestHeader.AcceptEncoding, "gzip,deflate");
        request.Headers.Set(HttpRequestHeader.AcceptLanguage, "en-US,en;q=0.8");
        request.Headers.Set(HttpRequestHeader.Cookie, @"v1st=3B46E5CCD302C2DE; marketlist=68|90|152|170|198; chasezip=zipcode=11577&county=Nassau&state=NY; ASP.NET_SessionId=kwybehscfioasswbl20wb14f; PC_1_0=n%3Dundefined|u%3Dundefined|l%3Dundefined|zip%3D11577|lastUpdate%3D2014-01-24|lastSent%3D2014-01-24|home%3Dpersonal|; SessionPersistence=CLICKSTREAMCLOUD%3A%3DvisitorId%3D%7CPROFILEDATA%3A%3D%7CSURFERINFO%3A%3Dbrowser%3DChrome%2COS%3DWindows%2Cresolution%3D1366x768%7C; fsr.s=%7B%22v2%22%3A-2%2C%22v1%22%3A1%2C%22rid%22%3A%22d464cf6-82273859-c860-572f-2944b%22%2C%22to%22%3A5%2C%22c%22%3A%22https%3A%2F%2Fwww.chase.com%2Fmortgage%2Floan-officer%2Fsearch-results.html%23action-search%3Bzipcode-11747%3Blastname-%3Blanguage-%22%2C%22pv%22%3A12%2C%22lc%22%3A%7B%22d18%22%3A%7B%22v%22%3A12%2C%22s%22%3Atrue%7D%7D%2C%22cd%22%3A18%2C%22sd%22%3A18%2C%22f%22%3A1390649574789%7D");
        // Parse with the invariant culture: the English day/month names in this
        // RFC 1123 date throw FormatException under non-English current cultures.
        // NOTE(review): a conditional request may yield 304 Not Modified with an
        // empty body — confirm this header is wanted when scraping.
        request.IfModifiedSince = DateTime.Parse("Fri, 24 Jan 2014 20:18:51 GMT", System.Globalization.CultureInfo.InvariantCulture);

        response = (HttpWebResponse)request.GetResponse();
    }
    catch (WebException e)
    {
        // A protocol error (HTTP 4xx/5xx, 304, ...) still carries a response
        // the caller can inspect; any other failure has none.
        if (e.Status == WebExceptionStatus.ProtocolError)
        {
            response = (HttpWebResponse)e.Response;
        }
        else
        {
            return false;
        }
    }
    catch (Exception)
    {
        if (response != null)
        {
            response.Close();
        }
        return false;
    }

    return true;
}

回答

1

要实现这一点,你需要解析 HTML,然后下载并运行其中的 JavaScript。不要自己编写浏览器,而应使用 WebBrowser 控件加载页面,然后抓取其内部的 HTML。

+0

你的解决方案很好,但现在我遇到了 HTTP 501/HTTP 505 错误。如果使用 WebRequest,我可以把 HTTP 协议版本改为 1.0 或 1.1;但在 WebBrowser 控件或 http://www.eztools-software.com/tools/wow/wow_net.asp 中似乎做不到这一点。 – Joe

+0

你已经把上面的评论另开成一个新问题了,所以我在这里回复就没有意义了。 – EricLaw

0

该页面使用 AJAX 生成结果,因此你在响应中看到的只是最初的 HTML。

+0

但浏览器本身也是一个程序,所以应该有办法获得这些结果。 – Joe

+1

要做到这一点,你需要解析 HTML,然后下载并运行其中的 JavaScript。不要自己编写浏览器,而应使用 WebBrowser 控件加载页面,然后抓取其内部的 HTML。 – EricLaw

+0

是的,@EricLaw,谢谢你。我原以为已经有人写好了无需这种控件就能使用的库。我通常没猜错——总会有好心人做了这项工作并分享成果。 – Joe