
Asynchronous web crawling in F#, did something go wrong?

Not quite sure whether one can ask this here, but here is my question: is there something wrong with my code? It is not as fast as I would like, and since I use a lot of asynchronous workflows I may be doing something wrong somewhere. The goal is to build something that can crawl 20,000 pages in under an hour.

open System
open System.Text
open System.Net
open System.IO
open System.Text.RegularExpressions
open System.Collections.Generic
open System.ComponentModel
open Microsoft.FSharp
open System.Threading
//This is the Parallel.Fs file

type ComparableUri(uri: string) =
    inherit System.Uri(uri)

    // Compare uris on scheme, host, port and path segments only.
    let elts (uri: System.Uri) =
        uri.Scheme, uri.Host, uri.Port, uri.Segments

    interface System.IComparable with
        member this.CompareTo(uri2) =
            compare (elts this) (elts (uri2 :?> ComparableUri))

    override this.Equals(uri2) =
        compare this (uri2 :?> ComparableUri) = 0

    // NB: a constant hash code makes every hash-based collection degenerate
    // to linear search; only the comparison-based Set stays efficient.
    override this.GetHashCode() = 0


///////////////////////////////////////////////Functions to retrieve html string//////////////////////////////
// NB: 'lock error' / 'lock visited' below take a lock on the current Set
// instance, which is replaced on every write; a dedicated lock object
// would be safer.
let mutable error = Set.empty<ComparableUri>
let mutable visited = Set.empty<ComparableUri>

let getHtmlPrimitiveAsyncDelay (delay: int) (uri: ComparableUri) =
    async {
        try
            let req = WebRequest.Create(uri) :?> HttpWebRequest
            req.UserAgent <- "Mozilla"

            //Console.WriteLine("Waiting")
            do! Async.Sleep(delay * 250)
            let! resp = req.AsyncGetResponse()
            Console.WriteLine(uri.AbsoluteUri + " got response after delay " + string delay)

            // 'use' is equivalent to 'using' in C# for an IDisposable
            use stream = resp.GetResponseStream()
            use reader = new StreamReader(stream)
            let html = reader.ReadToEnd()
            return html
        with ex ->
            Console.WriteLine(ex.ToString())
            lock error (fun () -> error <- error.Add uri)
            lock visited (fun () -> visited <- visited.Add uri)
            return "BadUri"
    }



///////////////////////////////////////////////Active Pattern Matching to retrieve href//////////////////////////////

let (|Matches|_|) (pat: string) (inp: string) =
    let m = Regex.Matches(inp, pat)
    // Every element of a MatchCollection is a full match (unlike m.Groups
    // below), so nothing should be dropped here; the original List.tail
    // silently discarded the first link on every page.
    if m.Count > 0 then Some [ for g in m -> g.Value ]
    else None

let (|Match|_|) (pat: string) (inp: string) =
    let m = Regex.Match(inp, pat)
    // Note the List.tail, since the first group is always the entirety of the matched string.
    if m.Success then Some (List.tail [ for g in m.Groups -> g.Value ])
    else None
    ///////////////////////////////////////////////Find Bad href////////////////////////////// 

let isEmail (link: string) =
    link.Contains("@")

let isMailto (link: string) =
    Seq.length link >= 6 && link.[0..5] = "mailto"

let isJavascript (link: string) =
    Seq.length link >= 10 && link.[0..9] = "javascript"

let isBadUri (link: string) =
    link = "BadUri"

let isEmptyHttp (link: string) =
    link = "http://"

let isFile (link: string) =
    Seq.length link >= 6 && link.[0..5] = "file:/"

let containsPipe (link: string) =
    link.Contains("|")


let isAdLink (link: string) =
    // Check the longer prefix first: in the original, the 'elif' branch was
    // unreachable (any string of length >= 9 also has length >= 6) and
    // compared only 9 characters against the 13-character "http://adLink".
    if Seq.length link >= 13 && link.[0..12] = "http://adLink" then true
    elif Seq.length link >= 6 then link.[0..5] = "adlink"
    else false

///////////////////////////////////////////////Retrieve and filter href links//////////////////////////////

let getHref (htmlString: string) =
    let urlPat = "href=\"([^\"]+)"

    let links =
        match htmlString with
        | Matches urlPat urls ->
            urls
            |> List.map (fun href ->
                match href with
                | Match urlPat [link] -> link
                | _ -> failwith "The href was not in the correct format, there was more than one match")
        | _ ->
            Console.WriteLine("No links for this page")
            []

    // Binding 'links' first guarantees the filters apply to both branches of
    // the match; piping straight off a match attaches only to the last case.
    links
    |> List.filter (fun link -> not (isEmail link))
    |> List.filter (fun link -> not (isMailto link))
    |> List.filter (fun link -> not (isJavascript link))
    |> List.filter (fun link -> not (isBadUri link))
    |> List.filter (fun link -> not (isEmptyHttp link))
    |> List.filter (fun link -> not (isFile link))
    |> List.filter (fun link -> not (containsPipe link))
    |> List.filter (fun link -> not (isAdLink link))

let treatAjax (href: System.Uri) =
    // Keep only what precedes the "#" fragment.
    let link = href.ToString()
    let firstPart = link.Split([| "#" |], System.StringSplitOptions.None).[0]
    new Uri(firstPart)

// Only follow pages with certain extensions, or ones with no extension.
let followHref (href: System.Uri) =
    let valid2 = set [".py"]
    let valid3 = set [".php"; ".htm"; ".asp"]
    let valid4 = set [".php3"; ".php4"; ".php5"; ".html"; ".aspx"]

    let lastSegment = href.Segments.[href.Segments.Length - 1]
    let len = Seq.length lastSegment

    if len <= 3 then
        not (lastSegment.Contains("."))
    else
        // Test the 2-letter extensions (".py").
        let last4 = lastSegment.[len - 4 .. len - 1]
        if valid2 |> Seq.exists (fun ext -> last4.EndsWith(ext)) then true
        elif len <= 4 then not (last4.Contains("."))
        else
            // Test the 3-letter extensions (".php", ".htm", ".asp").
            let last5 = lastSegment.[len - 5 .. len - 1]
            if valid3 |> Seq.exists (fun ext -> last5.EndsWith(ext)) then true
            elif len <= 5 then not (last5.Contains("."))
            else
                // Test the 4-letter extensions (".php3", ".html", ...).
                let last6 = lastSegment.[len - 6 .. len - 1]
                if valid4 |> Seq.exists (fun ext -> last6.EndsWith(ext)) then true
                else not (last6.Contains(".")) && not (lastSegment.[0..5] = "mailto")




// Create the correct links: resolve relative hrefs against the page's uri,
// then make them ComparableUris.
let hrefLinksToUri (uri: ComparableUri) (hrefLinks: string list) =
    hrefLinks
    |> List.choose (fun link ->
        try
            if Seq.length link < 4 then Some(Uri(uri, link))
            elif link.[0..3] = "http" then Some(Uri(link))
            else Some(Uri(uri, link))
        with ex ->
            Console.WriteLine(link)
            lock error (fun () -> error <- error.Add uri)
            None)
    |> List.map (fun uri -> ComparableUri(string uri))

// Treat each uri: strip the ajax fragment and only follow the links specified by Benoit.
let linksToFollow (hrefUris: ComparableUri list) =
    hrefUris
    |> List.map treatAjax
    |> List.filter followHref
    |> List.map (fun uri -> ComparableUri(string uri))
    |> Set.ofList



let needToVisit uri =
    lock visited (fun () -> not (visited.Contains uri))
    && lock error (fun () -> not (error.Contains uri))



let getLinksToFollowAsyncDelay (delay: int) (uri: ComparableUri) =
    async {
        let! links = getHtmlPrimitiveAsyncDelay delay uri
        lock visited (fun () -> visited <- visited.Add uri)

        let linksToFollow =
            getHref links
            |> hrefLinksToUri uri
            |> linksToFollow
            |> Set.filter needToVisit
        return linksToFollow
    }

// Returns the remaining milliseconds of a 500 ms per-authority budget
// (0 if the budget has expired or the authority has not been seen yet).
let getDelay (uri: ComparableUri) (authorityDelay: Dictionary<string, System.Diagnostics.Stopwatch>) =
    let uriAuthority = uri.Authority
    let hasAuthority, watch = authorityDelay.TryGetValue(uriAuthority)

    if hasAuthority then
        let remaining = TimeSpan(0, 0, 0, 0, 500) - watch.Elapsed
        if remaining.TotalMilliseconds < 0.0 then 0
        else int remaining.TotalMilliseconds
    else
        let watch = System.Diagnostics.Stopwatch()
        watch.Start()
        authorityDelay.Add(uriAuthority, watch)
        0




let rec getLinksToFollowFromSetAsync maxIteration (uris: seq<ComparableUri>) =
    let authorityDelay = Dictionary<string, System.Diagnostics.Stopwatch>()

    if maxIteration = 100 then
        Console.WriteLine("Finished")
    else
        // Group by authority: uris sharing an authority get a delay, the others none.
        let stopwatch = System.Diagnostics.Stopwatch()
        stopwatch.Start()
        let newLinks =
            uris
            |> Seq.map (fun uri ->
                let delay = lock authorityDelay (fun () -> getDelay uri authorityDelay)
                getLinksToFollowAsyncDelay delay uri)
            |> Async.Parallel
            |> Async.RunSynchronously
            |> Seq.concat
        stopwatch.Stop()
        Console.WriteLine("\n\n\n\n\n\n\nTimeElapse : " + string stopwatch.Elapsed + "\n\n\n\n\n\n\n\n\n")

        getLinksToFollowFromSetAsync (maxIteration + 1) newLinks

// Kick off the crawl; the PSeq variant needs FSharp.PowerPack's parallel sequences.
seq [set [ComparableUri("http://rue89.com/")]]
|> PSeq.ofSeq
|> PSeq.iter (getLinksToFollowFromSetAsync 0)

getLinksToFollowFromSetAsync 0 (seq [ComparableUri("http://twitter.com/")])

Console.WriteLine("Finished")

Some feedback would be great! Thanks (note that this is just something I am doing for fun).


Please format long code snippets so that they fit in roughly 50 columns and do not need scrolling to the right. – Brian 2010-06-11 16:54:54

Answer


I think the culprit is the line do! Async.Sleep(delay * 250) - you will gradually wait longer and longer. What is the reason for that?
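A small sketch of what that means against the question's own code (keeping getDelay as written, since it already returns the remaining milliseconds of a 500 ms per-authority budget): the workflow could sleep that value directly, whereas scaling it by 250 stretches a budget of at most 500 ms into a wait of up to 125 seconds per request.

// Hypothetical tweak inside getHtmlPrimitiveAsyncDelay, assuming 'delay'
// is the millisecond value that getDelay computed for this authority:
if delay > 0 then
    do! Async.Sleep(delay)    // instead of: do! Async.Sleep(delay * 250)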


You don't want to make requests to the same server too often, otherwise the webmaster will not be happy about you using all the server's bandwidth (http://stackoverflow.com/questions/3021888/asynchronous-crawling-f). Thanks for looking at the code – jlezard 2010-06-11 16:50:00


Agreed. The staggered delays make sense if you kick everything off at once, but if you have just finished a page that led you back to pages on the same authority, that time has probably already elapsed, yet now you are waiting N+1 seconds... – Brian 2010-06-11 16:52:13
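To put rough (hypothetical) numbers on that, compare a schedule that scales a per-host request index with one that only waits out the unexpired part of a fixed interval:

// Naive schedule: the 15th request to a host always waits 15 units,
// even if the 14th request went out 40 seconds ago.
let naiveWaitMs (requestIndex: int) = requestIndex * 250

// What the crawler actually owes: the remaining part of a fixed interval.
let remainingWaitMs (intervalMs: int) (elapsedMs: int) =
    max 0 (intervalMs - elapsedMs)

naiveWaitMs 15             // 3750 ms, regardless of history
remainingWaitMs 1000 40000 // 0 ms: the interval expired long ago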


I think you want one agent per authority, to make sure you send at most one request per second (or whatever the limit is) to that authority. But you need the agent, or something like it, to keep track of "the last time a request was sent here". Right now you just say "wait 15 seconds" before sending the 15th request, even if the 14th request went out 40 s ago. You need to "start the clock" at the right moment. – Brian 2010-06-11 16:53:54
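A minimal sketch of that one-agent-per-authority idea using an F# MailboxProcessor (throttleAgent, agentFor and the 1000 ms interval are all made up for illustration): each agent remembers when it last released a request for its host, so a caller waits only for the time still owed.

open System
open System.Collections.Generic

// One throttling agent per authority: a caller is released only once at
// least minIntervalMs has passed since the previous release for this host.
let throttleAgent (minIntervalMs: int) =
    MailboxProcessor<AsyncReplyChannel<unit>>.Start(fun inbox ->
        let rec loop lastSent = async {
            let! reply = inbox.Receive()
            match lastSent with
            | Some (t: DateTime) ->
                let elapsedMs = (DateTime.UtcNow - t).TotalMilliseconds
                if elapsedMs < float minIntervalMs then
                    do! Async.Sleep(minIntervalMs - int elapsedMs)
            | None -> ()    // first request for this host: no wait
            reply.Reply()
            return! loop (Some DateTime.UtcNow) }
        loop None)

// Lazily create one agent per authority (hypothetical wiring).
let agents = Dictionary<string, MailboxProcessor<AsyncReplyChannel<unit>>>()

let agentFor (authority: string) =
    lock agents (fun () ->
        if agents.ContainsKey authority then agents.[authority]
        else
            let agent = throttleAgent 1000    // assumed limit: 1 request/s per host
            agents.[authority] <- agent
            agent)

// Inside the download workflow, instead of do! Async.Sleep(delay * 250):
//     do! (agentFor uri.Authority).PostAndAsyncReply(fun ch -> ch)

Because each agent drains its queue sequentially, requests to the same host are serialized automatically while different hosts keep running in parallel.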