2012-06-30 46 views
2

我想在初始化阶段使用 Scrapy 登录一个网站,在确认登录成功之后,再完成初始化并通过 start_urls 启动标准爬取。我不确定哪里出了问题:登录本身没有问题,每一步都得到了确认,但 parse_item 始终没有被调用。任何帮助将不胜感激。(标题:Scrapy InIt self.initialized() - 不初始化)

我可以看到输出 "=========Successfully logged in.========="

但始终看不到输出 "==============PARSE ITEM=========================="

from scrapy.contrib.spiders.init import InitSpider 
from scrapy.http import Request, FormRequest 
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 
from scrapy.contrib.spiders import Rule 
from selenium import webdriver 

class ProductDetailsSpider(InitSpider):
    """Spider that logs in to my_domain.com before crawling ``start_urls``.

    The initialization chain is ``init_request`` -> ``login`` ->
    ``login_cookies`` -> ``check_login_response``. The last step must return
    the result of ``self.initialized()`` so that the regular crawl starts.
    """

    name = 'product_details_spider'
    allowed_domains = ['my_domain.com']
    login_page = 'http://www.my_domain.com/'
    start_urls = [
        'http://www.my_domain.com/nextpage1/',
        'http://www.my_domain.com/nextpage2/',
        'http://www.my_domain.com/nextpage3/',
    ]

    # NOTE(review): ``rules`` are normally consumed by CrawlSpider; confirm
    # that the InitSpider of the Scrapy version in use applies them, otherwise
    # ``parse_item`` will not be reached through this rule.
    rules = (
        Rule(SgmlLinkExtractor(allow=()),
             callback='parse_item',
             follow=True),
    )

    def get_cookies(self):
        """Log in through a Firefox session and return the session cookies.

        Returns:
            dict mapping each cookie name to its value.
        """
        driver = webdriver.Firefox()
        try:
            driver.implicitly_wait(30)
            base_url = "http://www.my_domain.com"
            driver.get(base_url + "/")
            # Fill in the credential fields of the login form.
            for field_name, text in (("USR", "my_user"),
                                     ("PASSWRD", "my_pass")):
                field = driver.find_element_by_name(field_name)
                field.clear()
                field.send_keys(text)
            driver.find_element_by_name("submit").click()
            cookies = driver.get_cookies()
        finally:
            # Always release the browser window, even if a lookup above fails.
            driver.close()
        return {c['name']: c['value'] for c in cookies}

    def init_request(self):
        """Return the request for the login page; runs before crawling starts."""
        # Single-argument print(...) prints the same text on Python 2 and 3.
        print('=======================INIT=======================')
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        """Generate the login form submission from the login page response.

        Returns:
            A one-element list with the FormRequest whose callback is
            ``login_cookies``.
        """
        print('=======================LOGIN=======================')
        return [FormRequest.from_response(
            response,
            formname='login_form',
            formdata={'USR': 'my_user', 'PASSWRD': 'my_pass'},
            callback=self.login_cookies)]

    def login_cookies(self, response):
        """Request the home page using the cookies from ``get_cookies``."""
        print('=======================COOKIES=======================')
        return Request(url='http://www.my_domain.com/home',
                       cookies=self.get_cookies(),
                       callback=self.check_login_response)

    def check_login_response(self, response):
        """Check the home page response to see whether the login succeeded.

        Returns:
            The result of ``self.initialized()`` when "Logoff" appears in the
            response body; ``None`` otherwise, so nothing further happens.
        """
        print('=======================CHECK LOGIN=======================')
        if "Logoff" in response.body:
            print("=========Successfully logged in.=========")
            # The result of initialized() must be returned from this callback;
            # merely calling it discards the value and the crawl never starts.
            return self.initialized()
        # Something went wrong, we couldn't log in, so nothing happens.
        print("==============Bad times :(===============")

    def parse_item(self, response):
        """Callback named by ``rules``; scrape data from the page here."""
        print("==============PARSE ITEM==========================")
        # Scrape data from page

回答

3

我来得有点晚,但我很确定你需要把 self.initialized() 的结果 return 出去:

# Fragment of check_login_response: the result of self.initialized() is
# returned from the callback; calling it without `return` discards the value.
if "Logoff" in response.body: 
    print "=========Successfully logged in.=========" 
    return self.initialized() 
    # Now the crawling can begin.. 
+0

非常感谢,那是我遇到的问题的解决方案! – Murph