0
以下爬虫(spider)代码输出的文件名格式为 k.1375093834.0.txt,
而我想要的格式是 kickstarter.com.1375093834.0.txt。
爬虫的起始 URL 来自命令行参数。
如果能给出修改代码的建议,我将非常感激。
class shnurl(CrawlSpider):
    """Crawl spider whose start URL is supplied on the command line.

    Invoke as ``scrapy crawl shnurl -a start_url="https://www.kickstarter.com"``.
    The results file and the log file are both named
    ``<site>.<timestamp>.<ext>``, for example
    ``kickstarter.com.1375093834.0.txt``.
    """

    name = "shnurl"

    # NOTE(review): Scrapy's CrawlSpider documentation advises against using
    # "parse" as a rule callback, because CrawlSpider implements its own
    # crawling logic in parse(). Left unchanged here; confirm against the
    # rest of the spider before renaming the callback.
    rules = [
        Rule(SgmlLinkExtractor(), follow=True, callback="parse")
    ]

    def __init__(self, *args, **kwargs):
        """Set up the start URL, the results file and the log file.

        Keyword arguments:
            start_url: URL to begin crawling from (passed with ``-a``).

        Raises:
            ValueError: if ``start_url`` is missing or empty.
        """
        # Initialize the parent class.
        super(shnurl, self).__init__(*args, **kwargs)

        # Get the start URL from the command line.
        start_url = kwargs.get('start_url')
        if not start_url:
            raise ValueError(
                'start_url is required, e.g. '
                'scrapy crawl shnurl -a start_url="https://www.kickstarter.com"'
            )
        self.start_urls = [start_url]

        # The original code used start_url[12], which picks a single
        # character ("k"); the whole site name is wanted instead.
        site = self._site_name(start_url)
        # Take the timestamp once so both files share the same name stem.
        stamp = time.time()

        # Create a results file based on the site name + current time.
        # The handle is kept on the instance; nothing in this block closes it.
        self.fname = '{0}.{1}.{2}'.format(site, stamp, 'txt')
        self.fileout = open(self.fname, 'w+')

        # Create a logfile based on the site name + current time.
        # Log file stores the errors, debug & info prints.
        logfname = '{0}.{1}.{2}'.format(site, stamp, 'log')
        log.start(logfile=logfname, loglevel=log.INFO)
        self.log('Output will be written to: {0}'.format(self.fname), log.INFO)

    @staticmethod
    def _site_name(url):
        """Return the host part of *url* without scheme, path or leading ``www.``."""
        host = url.split('://', 1)[-1]   # drop "http://", "https://", ...
        host = host.split('/', 1)[0]     # drop any path after the host
        if host.startswith('www.'):
            host = host[len('www.'):]
        return host
使用的命令:
scrapy crawl shnurl -a start_url="https://www.kickstarter.com"
它……可以用了!!!!从没想过会这么简单……谢谢 @Talvalin – hanu
接着上面的问题,我的爬虫代码如下:http://notepad.cc/share/28pioGWQ5T 。我想要的输出只有 href,但我实在看不懂下面这份输出里的内容:http://notepad.cc/share/XoGbXIaPaP – hanu