如果您想要爬取整个网站,可以看看 scrapy 提供的 CrawlSpider。我还使用了 lxml.html,因为它提供了更多的灵活性。要安装这些库,可以使用:
pip install scrapy
pip install lxml
要生成一个基本的 scrapy 项目骨架,可以使用以下命令:
scrapy startproject elections
然后添加 spider 和 item 定义:
elections/spiders/spider.py
from scrapy.spiders import CrawlSpider, Rule
from elections.items import ElectionsItem
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from lxml import html
class ElectionsSpider(CrawlSpider):
    """Crawl the elections.ca nomination-contest listing and yield one
    ElectionsItem per candidate row found on each Details page."""

    name = "elections"
    allowed_domains = ["elections.ca"]
    start_urls = ["http://www.elections.ca/WPAPPS/WPR/EN/NC/Details?province=-1&distyear=2013&district=-1&party=-1&pageno=1&totalpages=55&totalcount=1372&viewall=1"]

    rules = (
        Rule(
            LxmlLinkExtractor(
                # Follow every nomination-contest Details page.
                allow=('http://www.elections.ca/WPAPPS/WPR/EN/NC/Details.*',),
            ),
            callback='parse_item',
            follow=True,
        ),
    )

    def unindent(self, string):
        """Strip leading/trailing whitespace from every line of *string*
        and join the stripped lines into a single string.

        BUGFIX: the original called ``string.encode('utf8')`` and mapped
        ``str.strip`` over the resulting lines, which raises TypeError on
        Python 3 (``str.strip`` applied to ``bytes``). Operate on the str
        directly instead.
        """
        return ''.join(line.strip() for line in string.splitlines(True))

    def parse_item(self, response):
        """Parse one Details page, yielding one item per candidate row.

        Entry-level fields (dates, party, district, association) come from
        the enclosing ``wpr-detailgroup`` fieldset and are shared by every
        candidate in that fieldset.
        """
        original_html = Selector(response).extract()
        lxml_obj = html.fromstring(original_html)
        for entry in lxml_obj.xpath('.//fieldset[contains(@class,"wpr-detailgroup")]'):
            # Collect the fields common to every candidate of this entry.
            shared = {}
            date = entry.xpath('.//legend[contains(@class,"wpr-ltitle")]/span[contains(@class,"date")]')
            if date:
                shared['date'] = self.unindent(date[0].text.strip())
            party = entry.xpath('.//legend[contains(@class,"wpr-ltitle")]')
            if party:
                shared['party'] = self.unindent(party[0].text.strip())
            start_date = entry.xpath('.//div[contains(@class,"group")]/span[contains(@class,"date")][1]')
            if start_date:
                shared['start_date'] = self.unindent(start_date[0].text.strip())
            end_date = entry.xpath('.//div[contains(@class,"group")]/span[contains(@class,"date")][2]')
            if end_date:
                shared['end_date'] = self.unindent(end_date[0].text.strip())
            # The label text sits in the div; the value is the div's tail.
            electoral_district = entry.xpath('.//div[contains(@class,"wpr-title")][contains(text(),"Electoral district:")]')
            if electoral_district:
                shared['electoral_district'] = self.unindent(electoral_district[0].tail.strip())
            registered_association = entry.xpath('.//div[contains(@class,"wpr-title")][contains(text(),"Registered association:")]')
            if registered_association:
                shared['registered_association'] = self.unindent(registered_association[0].tail.strip())
            for candidate in entry.xpath('.//table[contains(@class, "wpr-datatable")]//tr[not(@class)]'):
                # BUGFIX: the original mutated and re-yielded a single item
                # instance, so any field missing for one candidate silently
                # kept the previous candidate's value. Build a fresh item
                # per row, seeded with the shared entry-level fields.
                item = ElectionsItem(shared)
                item['elected'] = len(candidate.xpath('.//img[contains(@alt, "contestant won this nomination contest")]'))
                candidate_name = candidate.xpath('.//td[contains(@headers,"name")]')
                if candidate_name:
                    item['candidate_name'] = self.unindent(candidate_name[0].text.strip())
                item['address'] = self.unindent(candidate.xpath('.//td[contains(@headers,"address")]')[0].text_content().strip())
                item['financial_agent'] = self.unindent(candidate.xpath('.//td[contains(@headers,"fa")]')[0].text_content().strip())
                yield item
elections/items.py
from scrapy.item import Item, Field
class ElectionsItem(Item):
    """One candidate record scraped from an elections.ca
    nomination-contest Details page (populated by ElectionsSpider)."""
    # Date shown in the fieldset legend of the detail group.
    date = Field()
    # Text of the legend itself (party / contest title line).
    party = Field()
    # First and second <span class="date"> of the detail group.
    start_date = Field()
    end_date = Field()
    # Values following the "Electoral district:" / "Registered association:" labels.
    electoral_district = Field()
    registered_association = Field()
    # 1 if the winner image is present in the candidate's row, else 0.
    elected = Field()
    # Per-candidate table cells (headers: name / address / fa).
    candidate_name = Field()
    address = Field()
    financial_agent = Field()
elections/settings.py
# Scrapy project settings for the 'elections' crawler.
BOT_NAME = 'elections'

SPIDER_MODULES = ['elections.spiders']
NEWSPIDER_MODULE = 'elections.spiders'

# Route every scraped item through the CSV-exporting pipeline (priority 300).
# NOTE(review): pipelines.py defines the class as 'electionsPipeline'
# (lower-case 'e'); this path expects 'ElectionsPipeline'. One of the two
# must be renamed or Scrapy will fail to load the pipeline.
ITEM_PIPELINES = {
    'elections.pipelines.ElectionsPipeline': 300,
}
elections/pipelines.py
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.exporters import CsvItemExporter
class ElectionsPipeline(object):
    """Export every scraped item to a per-spider CSV file.

    BUGFIX: renamed from 'electionsPipeline' — settings.py registers
    'elections.pipelines.ElectionsPipeline', so the lower-case class name
    made Scrapy fail to load the pipeline (and violated PEP 8 PascalCase).
    """

    def __init__(self):
        # Tie the exporter lifecycle to the spider open/close signals.
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        # Maps spider -> its open CSV file handle.
        self.files = {}

    def spider_opened(self, spider):
        """Open the output file and start the CSV exporter."""
        # One CSV per spider, e.g. 'elections_ads.csv'; binary mode as
        # required by CsvItemExporter.
        file = open('%s_ads.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close this spider's file."""
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        """Write one item to the CSV and pass it on unchanged."""
        self.exporter.export_item(item)
        return item
您可以在项目根目录下通过以下命令运行爬虫:
scrapy runspider elections/spiders/spider.py
运行后应该会在项目根目录下生成 elections_ads.csv(文件名由 pipeline 中的 '%s_ads.csv' % spider.name 决定),内容如下:
financial_agent,end_date,candidate_name,registered_association,electoral_district,elected,address,date,party,start_date
"Jan BalcaThornhill, OntarioL4J 1V9","September 09, 2015",Leslyn Lewis,,Scarborough--Rouge Park,1,"Markham, OntarioL6B 0K9","September 09, 2015",,"September 07, 2015"
"Mark HicksPasadena, Newfoundland and LabradorA0L 1K0","September 08, 2015",Roy Whalen,,Long Range Mountains,1,"Deer Lake, Newfoundland and LabradorA8A 3H6","September 08, 2015",,"August 21, 2015"
,"September 08, 2015",Wayne Ruth,,Long Range Mountains,0,"Kippens, Newfoundland and LabradorA2N 3B8","September 08, 2015",,"August 21, 2015"
,"September 08, 2015",Mark Krol,,St. John's South--Mount Pearl,1,"Woodbridge, OntarioL4L 1Y5","September 08, 2015",,"August 24, 2015"
,"September 08, 2015",William MacDonald Alexander,,Bow River,1,"Calgary, AlbertaT2V 0M1","September 08, 2015",,"September 04, 2015"
(...)
你想凑所有条目?或者你想过滤一个搜索条件(省/领土,再分配年,选举区,政党,协会关键词,参赛者关键词,比赛日期)? –
您使用哪个分隔符?另外,如果您在某些字段中有分隔符,是否使用了quotechar? – Fejs
@IvanChaer我想根本没有任何过滤的东西,截至目前,我的代码可以做到这一点 - 这只是一个问题,获取存储在每个页面上的所有信息,再加上csv输出问题。 – HowenWilson