0
Created on Apr 19, 2016
@author: harshitha
import re
import urllib
from urllib import urlopen
from bs4 import BeautifulSoup
import urllib2
import csv
import sys
print(sys.version_info)
print(sys.version)
f = open('/home/harshitha/Documents/house_database.csv', 'w')
zz=0
f.write('Zipcode')
f.write(',')
f.write('Bedrooms')
f.write(',')
f.write('Bathrooms')
f.write(',')
f.write('Square_Footage')
f.write(',')
f.write('Price_Per_SqFt')
f.write(',')
f.write('Lot_Size')
f.write(',')
f.write('Stories')
f.write(',')
f.write('Property_Type')
f.write(',')
f.write('Year_Built')
f.write(',')
f.write('MLS')
f.write(',')
f.write('Neighborhood')
f.write(',')
f.write('County')
f.write(',')
f.write('Monthly_Est_Motgage ')
f.write(',')
f.write('Monthly_Est_Insurance')
f.write(',')
f.write('Last_Updated')
f.write(',')
f.write('Last_Sold_Date')
f.write(',')
f.write('Last_Sold_Price')
f.write(',')
f.write('\n')
# Get all the zipcodes
with open('/home/harshitha/Documents/zip.csv','rU') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
zipcode = row['ZipCode']
zz = zz + 1
if zz <3789:
continue
#from the link
link = 'http://www.homes.com/property/1416-church-st-san-francisco- ca-94131/id-100013343668/'
html = urlopen(link).read()
parsed_html = BeautifulSoup(html)
#print parsed_html
Bedrooms = '--'
Bathrooms = '--'
Square_Footage = '--'
Price_Per_SqFt = '--'
Lot_Size = '--'
Stories = '--'
Property_Type = '--'
Year_Built = '--'
MLS = '--'
Neighborhood = '--'
County = '--'
Monthly_Est_Motgage = '--'
Monthly_Est_Insurance = '--'
Last_Updated = '--'
Last_Sold_Date = '--'
Last_Sold_price = '--'
try:
data = parsed_html.find("dt", text=re.compile("Bedroom(s)"))
if data is not None:
Bedrooms = data.parent.findNextSibling("dd").text
data = parsed_html.find("dt", text=re.compile("Bathroom(s)"))
if data is not None:
Bathrooms = data.parent.findNextSibling("dd").text
data = parsed_html.find("dt", text=re.compile("Square Footage"))
if data is not None:
Square_Footage = data.parent.findNextSibling("dd").text
data = parsed_html.find("dt", text=re.compile("Price Per SqFt "))
if data is not None:
Price_Per_SqFt = data.parent.findNextSibling("dd").text
data = parsed_html.find("dt", text=re.compile("Lot Size"))
if data is not None:
Lot_Size = data.parent.findNextSibling("dd").text
data = parsed_html.find("dt", text=re.compile("Stories"))
if data is not None:
data_field = data.parent.findNextSibling("dd")
if data_field is not None:
Stories = BeautifulSoup(''.join(data_field[0])).text
data = parsed_html.find("dt", text=re.compile("Property Type"))
if data is not None:
data_field = data.parent.findNext("dt")
data_field = data.parent.findNextSibling("dd")
if data_field is not None:
Apt = BeautifulSoup(''.join(data_field[0])).text
condo = BeautifulSoup(''.join(data_field[0])).text
data = parsed_html.find("dt", text=re.compile("YearBuilt"))
if data is not None:
Year_Built = data.parent.findNextSibling("dd").text
data = parsed_html.find("dt", text=re.compile("MLS"))
if data is not None:
data_field = data.parent.findNextSibling("dd")
if data_field is not None:
MLS = BeautifulSoup(''.join(data_field[0])).text
data = parsed_html.find("dt", text=re.compile("Neighborhood"))
if data is not None:
data_field = data.parent.findNextSibling("dd")
if data_field is not None:
Neighborhood = BeautifulSoup(''.join(data_field[0])).text
data = parsed_html.find("dt", text=re.compile("County"))
if data is not None:
data_field = data.parent.findNextSibling("dd")
if data_field is not None:
county = BeautifulSoup(''.join(data_field[0])).text
data = parsed_html.find("dt", text=re.compile("Monthly Est Mortgage"))
if data is not None:
data_field = data.parent.findNextSibling("dd")
if data_field is not None:
Monthly_Est_Mortgage = BeautifulSoup(''.join(data_field[0])).text
data = parsed_html.find("dt", text=re.compile("Monthly Est Insurance"))
if data is not None:
data_field = data.parent.findNextSibling("dd")
if data_field is not None:
Monthly_Est_Insurance = BeautifulSoup(''.join(data_field[0])).text
data = parsed_html.find("dt", text=re.compile("Last Updated"))
if data is not None:
data_field= data.parent.findNextSibling("dd")
if data_field is not None:
Last_Updated = BeautifulSoup(''.join(data_field[0])).text
data = parsed_html.find("dt", text=re.compile("Last Sold Date"))
if data is not None:
data_field= data.parent.findNextSibling("dd")
if data_field is not None:
Last_Sold_Date = BeautifulSoup(''.join(data_field[0])).text
data = parsed_html.find("dt", text=re.compile("Last Sold Price"))
if data is not None:
data_field1 = data.parent.findNextSibling("dd")
if data_field1 is not None:
Last_Sold_Price = BeautifulSoup(''.join(data_field1[0])).text
except:
print ('')
f.write(zipcode + ',' + Bedrooms.replace(',','')+ ',' + Bathrooms.replace(',','') + ',' + Square_Footage.replace(',','')+ ',' + Price_Per_SqFt.replace(',','')+ ',' + Lot_Size(',','') + ',' + Stories.replace(',',''))
'''f.write(zipcode + ',' + Bedrooms.replace(',','')+ ',' + Bathrooms.replace(',','') + ',' + Square_Footage.replace(',','') \
+ ',' + Price_Per_SqFt.replace(',','')+ ',' + Lot_Size(',','') + ',' + Stories.replace(',','') \
+ ',' + Property_Type.replace(',','') + ',' +
可以在任何一个可以帮助我 – harshita
在最后一行是一个注释掉了一部分代码或一些差事? – MarcinWolny
首先,我认为身份认证没有正确安排。你应该手工修复它或使用像Sublim这样的编辑器。 其次,我想你正试图在'zip.csv'文件中找到**每个** zipcode的所有数据。我对吗?如果是这样的话,你就没有做正确的事情。 with语句中的for循环不断更新'zipcode'变量。也就是说,一旦for循环结束,'zipcode'变量将只保存'zip.csv'中的** last ** zipcode。 – DboyLiao