2016-09-12 62 views
0

我试图在我的抓取工具中实现策略模式,我认为用不同的策略来抓取不同的网站会很好。所以我希望 page.evaluate 内执行的内容能根据当前正在抓取的网站而有所不同。page.evaluate 中被注释掉的代码是有效的,但有没有办法把它提取到一个函数中?我尝试过运行 this.findJobs(),但没有成功。(标题:在 PhantomJS 中使用策略模式)

"use strict"; 

var Crawler = function() { 
    this.page = require('webpage').create(); 
    this.website = ""; 
    this.jobs_list = []; 

}; 

Crawler.prototype.setStrategy = function(company) { 
    this.website = company; 
}; 

Crawler.prototype.findJobData = function() { 
    return this.website.findJobData(); 
}; 

Crawler.prototype.collectJobData = function() { 
    var page = require('webpage').create(); 
    page.onConsoleMessage = function(msg) { console.log(msg) }; 

    page.open('URL', function (status) { 
     page.includeJs("https://ajax.googleapis.com/ajax/libs/jquery/3.1.0/jquery.min.js", function() { 
      var temp_jobs = page.evaluate(this.findJobs()); 

       /* 
       var jobs = []; 
       var job; 
        $('ul.job-list').each(function(){ 
        $(this).find('li').each(function(){ 
         var job_link = $(this).find('a'); 
         var url = "URL" + job_link.attr("href"); 
         var location = $(this).find('span').text(); 

         job = {title: job_link.text(), url: url, location: location, description: ""} 
         jobs.push(job); 
         console.log(job.title, job.url, job.location); 
        }) 
       }); 
       return jobs;*/ 
      console.log(temp_jobs[0].title) 

      phantom.exit(0); 
     }); 
    }); 

}; 

/**
 * Scraping strategy for a site that lists jobs as `li` elements inside
 * `ul.job-list` containers.
 *
 * @constructor
 */
var strategy_a = function() {

    /**
     * Collects one record per `li` found under every `ul.job-list`.
     *
     * Relies on a global jQuery `$` and refers to nothing outside its own
     * body, so it can be handed as-is to `page.evaluate`.
     *
     * @returns {Array} Objects of the form {title, url, location, description}.
     */
    this.findJobs = function() {
      var collected = [];

      $('ul.job-list').each(function() {
        var $items = $(this).find('li');

        $items.each(function() {
          var $item = $(this);
          var $anchor = $item.find('a');
          var href = "URL" + $anchor.attr("href");
          var place = $item.find('span').text();
          var entry = {
            title: $anchor.text(),
            url: href,
            location: place,
            description: ""
          };

          collected.push(entry);
          // Echo each record so it shows up through the page's console.
          console.log(entry.title, entry.url, entry.location);
        });
      });

      return collected;
    };
};


// Wire a site-specific strategy into the crawler and start the crawl.
// The instance is stored under its own name so the `strategy_a` constructor
// stays available for creating further instances.
var strategyA = new strategy_a();
var crawler = new Crawler();

crawler.setStrategy(strategyA);
crawler.collectJobData();

回答

1

你有两个问题:

  • 你应该使用 page.evaluate(this.findJobs);(传入函数本身),而不是 page.evaluate(this.findJobs());(传入函数的调用结果)

  • 在 page.includeJs 回调里面,this 并不是对 Crawler 实例的引用。

这样应该可以工作(您原先创建了多个页面,却没有把它们全部用上):

/**
 * Opens the target page, injects jQuery, runs the current strategy's
 * `findJobs` inside the page and logs the title of the first job found.
 *
 * Always terminates PhantomJS: exit code 0 once the page was scraped,
 * exit code 1 when the page could not be loaded.
 */
Crawler.prototype.collectJobData = function() {
    var page = this.page;
    // Callbacks below are not invoked with the crawler as `this`, so keep an
    // explicit reference to it.
    var self = this;
    page.onConsoleMessage = function(msg) { console.log(msg); };

    page.open('URL', function (status) {
        if (status !== 'success') {
            console.log('Unable to load page: ' + status);
            phantom.exit(1);
            return;
        }

        page.includeJs("https://ajax.googleapis.com/ajax/libs/jquery/3.1.0/jquery.min.js", function() {
            // Pass the function itself (not its result): page.evaluate runs it
            // in the page context, where jQuery's `$` is available.
            var temp_jobs = page.evaluate(self.website.findJobs);

            // Guard against an empty or missing result so the exit below is
            // always reached.
            if (temp_jobs && temp_jobs.length > 0) {
                console.log(temp_jobs[0].title);
            } else {
                console.log('No jobs found');
            }

            phantom.exit(0);
        });
    });
};

注意:您创建了多个页面却没有全部使用,所以我删除了第二个 require('webpage').create() 调用。

+0

运行代码时我收到以下错误:TypeError: undefined is not an object (evaluating 'this.website.findJobs') undefined:2 :3 – Pierre

+1

你的 'Crawler.prototype.findJobData' 函数是没有用的,因为具体的 'findJobs' 函数必须直接在页面上下文中运行。你不能用 'findJobData' 作为 'findJobs' 的代理,因为 'page.evaluate' 是沙箱化的,不允许引用页面上下文之外的内容。无论如何,我已经修正了我的答案。 –

+0

感谢您的澄清。 – Pierre