2016-07-27 160 views
0
import urllib2  
from selenium import webdriver  
from selenium.webdriver.common.by import By  
from selenium.webdriver.common.keys import Keys 

# Open the Justdial "CA in Mumbai" page, print the text of every matching
# container element, then append one CSV row built from the first match of
# each field xpath to textfile.csv.
url = "http://www.justdial.com/Mumbai/CA"
driver = webdriver.Firefox()
try:
    driver.get(url)

    # Print the text of every element matching the container xpath.
    elements = driver.find_elements_by_xpath('//div[@class="col-md-12 col-xs-12 padding0"]')
    for e in elements:
        print(e.text)
    url = driver.current_url

    # find_element_by_xpath returns only the FIRST matching element, so the
    # values below describe a single entry, not the whole page.
    company_name = driver.find_element_by_xpath('//span[@class="jcn"]').text
    contact_number = driver.find_element_by_xpath('//p[@class="contact_info"]').text
    address = driver.find_element_by_xpath('//p[@class="adress_info"]').text
    address_info = driver.find_element_by_xpath('//p[@class="address-info adinfoex"]').text
    estd = driver.find_element_by_xpath('//li[@class="fr"]').text
    ratings = driver.find_element_by_xpath('//li[@class="last"]').text
finally:
    # Always shut the browser down, even when a lookup above raises.
    driver.quit()

tf = 'textfile.csv'

# The original wrote an undefined name `estd_ratings` as the last column
# (NameError); `estd` and `ratings` are written as two separate columns instead.
row = [company_name, contact_number, address, address_info, estd, ratings]

# NOTE: fields are joined with bare commas and no quoting, so a value that
# itself contains a comma will spill into extra columns.
with open(tf, 'a+') as f2:
    f2.write(','.join([data.encode('utf-8') for data in row]) + '\n')

回答

0

下面的代码应该可以帮助你入门。重要的是确保你的 xpath 表达式准确地选中你所需要的内容。Python 的 csv 模块可以自动把条目列表转换为逗号分隔的格式,无需你自己添加逗号:

import csv 
import urllib2  
from selenium import webdriver  
from selenium.webdriver.common.by import By  
from selenium.webdriver.common.keys import Keys 

def get_elements_by_xpath(driver, xpath):
    """Return the ``text`` of every element that ``driver`` finds for ``xpath``.

    The result is a list in the order returned by
    ``driver.find_elements_by_xpath``; it is empty when nothing matches.
    """
    texts = []
    for element in driver.find_elements_by_xpath(xpath):
        texts.append(element.text)
    return texts


# Scrape every match of each field xpath from the Justdial "CA in Mumbai"
# page and write them to textfile.csv, one column per field.
url = "http://www.justdial.com/Mumbai/CA"
driver = webdriver.Firefox()

# (CSV column header, xpath) pairs; the list order is the column order.
search_entries = [
    ("CompanyName", "//span[@class='jcn']"),
    ("ContactNumber", "//p[@class='contact-info']/span/a"),
    ("Address", "//span[@class='desk-add jaddt']"),
    ("AddressInfo", "//p[@class='address-info adinfoex']"),
    ("Estd", "//li[@class='fr']"),
    ("Ratings", "//li[@class='last']/a/span[@class='rt_count']"),
]

try:
    driver.get(url)
    # One list of texts per column, in the same order as search_entries.
    entries = [get_elements_by_xpath(driver, xpath) for name, xpath in search_entries]
finally:
    # Always shut the browser down, even when the page load or a search fails.
    driver.quit()

# Python 2's csv module expects the file to be opened in binary mode.
with open('textfile.csv', 'wb') as f_output:
    csv_output = csv.writer(f_output)

    # Write header
    csv_output.writerow([name for name, xpath in search_entries])

    # zip(*entries) turns the per-column lists into per-row tuples.
    # NOTE: zip stops at the shortest column, so if one xpath matches fewer
    # elements than the others, trailing rows are dropped and the columns may
    # no longer line up.
    for row in zip(*entries):
        # Python 2's csv writer cannot handle non-ASCII unicode objects;
        # encode each cell to UTF-8 bytes before writing.
        csv_output.writerow([cell.encode('utf-8') for cell in row])

这会给你一个CSV文件看起来像:

CompanyName,ContactNumber,Address,AddressInfo,Estd,Ratings 
Bansal Investment & Consult...,+(91)-22-38578062,Manpada-thane West.. | more..,"CA, Tax Consultants, more...",Estd.in 2003,27 Ratings 
G.Kedia & Associates,+(91)-22-38555914,"Station Road, Thane We.. | more..","CA, Company Registration Consultants, more...",Estd.in 2010,17 Ratings 
Tarun Shah & Associates,+(91)-22-38552775,"Mogra Lane, Andheri Ea.. | more..","CA, Income Tax Consultants, more...",Estd.in 2000,12 Ratings 
Hemant Shah And Associates LLP,+(91)-22-38588696,"Azad Road, Andheri Eas.. | more..","CA, Company Secretary, more...",Estd.in 1988,65 Ratings 

for 循环对每个 xpath 执行一次搜索,并为每次搜索创建一个数组条目。每次搜索都会返回一个匹配结果数组,因此最终会得到一个由多个数组组成的 entries 列表。

接下来需要把这些数据写入 CSV 文件。entries 是按列顺序排列的,而 CSV 文件需要按行顺序写入。为此,使用 zip(*entries) 将其转换为行顺序。由于整个数组现在已按正确的顺序排列,因此只需调用一次 writerows 即可一次性写入整个文件。

使用 Python 的 csv 库的额外好处是,如果任何字段包含逗号,它会自动在该字段两侧添加引号,以确保 Excel 不会将其解释为另一列。请注意,由于 Excel 会尝试猜测单元格类型,您可能需要更改默认的单元格类型格式。