2
我想在初始化阶段用 Scrapy 登录一个网站,确认登录成功后再完成初始化,并通过 start_urls 启动标准爬取。我不确定哪里出了问题:登录是成功的,每一步都得到了确认,但 parse_item 始终没有被调用。任何帮助都将不胜感激。(原标题:Scrapy InitSpider 的 self.initialized() 没有完成初始化)
我能看到输出“=========Successfully logged in.=========”,
但
始终看不到“==============PARSE ITEM==========================”。
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import Rule
from selenium import webdriver
class ProductDetailsSpider(InitSpider):
    """Spider that logs in before crawling ``start_urls``.

    The initialisation chain is:
    ``init_request`` -> ``login`` -> ``login_cookies`` ->
    ``check_login_response``.  Only when the last step returns the result of
    ``self.initialized()`` does the regular crawl begin.
    """

    name = 'product_details_spider'
    allowed_domains = ['my_domain.com']
    login_page = 'http://www.my_domain.com/'
    start_urls = ['http://www.my_domain.com/nextpage1/',
                  'http://www.my_domain.com/nextpage2/',
                  'http://www.my_domain.com/nextpage3/']

    # NOTE(review): ``rules`` is a CrawlSpider feature; InitSpider may not
    # process it, in which case start_urls responses go to the default
    # ``parse`` callback rather than ``parse_item`` -- confirm against the
    # Scrapy version in use.
    rules = (
        Rule(SgmlLinkExtractor(allow=()),
             callback='parse_item',
             follow=True),
    )

    def get_cookies(self):
        """Log in through a real Firefox session and return its cookies.

        Returns:
            dict mapping each cookie name to its value, as reported by the
            Selenium driver after the login form has been submitted.
        """
        driver = webdriver.Firefox()
        try:
            driver.implicitly_wait(30)
            base_url = "http://www.my_domain.com"
            driver.get(base_url + "/")

            user_field = driver.find_element_by_name("USR")
            user_field.clear()
            user_field.send_keys("my_user")

            password_field = driver.find_element_by_name("PASSWRD")
            password_field.clear()
            password_field.send_keys("my_pass")

            driver.find_element_by_name("submit").click()
            cookies = driver.get_cookies()
        finally:
            # Always shut the browser down, even when a lookup above raises,
            # so failed logins do not leave Firefox processes behind.
            driver.quit()
        return {cookie['name']: cookie['value'] for cookie in cookies}

    def init_request(self):
        """Return the first request of the login chain (the login page)."""
        print('=======================INIT=======================')
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        """Build the login form submission from the login page response.

        Returns:
            A one-element list holding the ``FormRequest`` that posts the
            credentials; its response is handled by ``login_cookies``.
        """
        print('=======================LOGIN=======================')
        return [FormRequest.from_response(
            response,
            formname='login_form',
            formdata={'USR': 'my_user', 'PASSWRD': 'my_pass'},
            callback=self.login_cookies)]

    def login_cookies(self, response):
        """Request the home page using the cookies of the Selenium session."""
        print('=======================COOKIES=======================')
        return Request(url='http://www.my_domain.com/home',
                       cookies=self.get_cookies(),
                       callback=self.check_login_response)

    def check_login_response(self, response):
        """Check the home page response and start the crawl if logged in.

        Returns:
            The result of ``self.initialized()`` when the page contains
            "Logoff"; ``None`` otherwise, so nothing further is scheduled.
        """
        print('=======================CHECK LOGIN=======================')
        if "Logoff" in response.body:
            print("=========Successfully logged in.=========")
            # The result MUST be returned: initialized() hands back the
            # pending start requests, and a callback that merely calls it
            # drops them, so the crawl never begins.
            return self.initialized()
        print("==============Bad times :(===============")
        # Something went wrong, we couldn't log in, so nothing happens.
        return None

    def parse_item(self, response):
        """Callback named by ``rules`` for extracted links."""
        print("==============PARSE ITEM==========================")
        # Scrape data from page
非常感谢,那是我遇到的问题的解决方案! – Murph