0
我是编程新手，我用 Python 的 Beautiful Soup 写了一个网页爬虫（web scraper），但运行程序时它只打开 Python 命令行窗口，光标一直闪烁，什么也没有发生……现在我收到这些错误：Beautiful Soup / Python 错误 (10060)，紧急求助
TimeoutError:[WinError 10060]连接尝试失败,因为连接的方没有正确一段时间后响应或已建立的连接失败,因为连接的主机没有反应
ConnectionResetError :[WinError 10054]现有连接被远程主机强制关闭
……请不要介意缩进，
下面是我的代码:
import urllib.request
import urllib
import json
import xml.etree.ElementTree as ET
import csv
from bs4 import BeautifulSoup
# Search page that returns the project-listing HTML for a POSTed search form.
link = 'https://maharerait.mahaonline.gov.in/searchlist/searchlist'
# AJAX endpoints that return JSON lists for the cascading search filters.
talukaLink = "https://maharerait.mahaonline.gov.in/SearchList/GetTaluka"
distlink = "https://maharerait.mahaonline.gov.in/SearchList/GetDistrict"
prjLink = "https://maharerait.mahaonline.gov.in/SearchList/GetProjectName"
alldata = []          # scraped project rows (filled by parseXMLData)
links = {}            # certificate number -> certificate-detail URL
certificatedata = []  # professional rows per certificate (processlinksforcert)
def getData(url, values, timeout=60):
    """POST form ``values`` to ``url`` and return the response body as text.

    Args:
        url: Target URL.
        values: Dict of form fields; urlencoded into the POST body.
        timeout: Socket timeout in seconds (new parameter; default keeps
            existing callers working but stops the request hanging forever).

    Returns:
        The response body decoded as UTF-8.
    """
    payload = urllib.parse.urlencode(values).encode('utf-8')
    # Some servers drop requests that carry no browser-like User-Agent,
    # which surfaces as WinError 10060/10054 connection failures.
    req = urllib.request.Request(
        url,
        data=payload,
        headers={'User-Agent': 'Mozilla/5.0'},
    )
    # Context manager guarantees the connection is closed even on error.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read().decode("utf-8")
def getDivsion():
    """Return the division ids to scrape.

    For now only 6 divisions are taken; this needs updating when the
    source data gets updated.  (The original trailing comment was broken
    across two lines in the paste, which is a SyntaxError.)
    """
    return range(1, 7)
def getDistrict(divId):
    """Fetch the district list (JSON text) for one division id."""
    # `distlink` is only read, so no `global` declaration is needed.
    return getData(distlink, {'DivID': divId})
def parseJson(data):
    """Decode a JSON document and return the resulting Python object."""
    return json.loads(data)
def getTaluka(disId):
    """Fetch the taluka list (JSON text) for one district id."""
    # `talukaLink` is only read, so no `global` declaration is needed.
    return getData(talukaLink, {'DisID': disId})
def getProjects(divId, disId):
    """Fetch the project list (JSON text) for a division/district pair.

    Responses shorter than 10 characters are treated as "no projects"
    and replaced by an empty JSON object.
    """
    payload = getData(prjLink, {'DisID': disId, 'DivID': divId})
    return payload if len(payload) >= 10 else "{}"
def getProjectsList():
    """Walk every division -> district -> project and collect search results.

    For each project, POSTs the search form to the module-level `link` and
    hands the returned HTML to parseXMLData, which appends rows to the
    module-level `alldata`.
    """
    count = 0
    for divId in getDivsion():
        for disObj in parseJson(getDistrict(divId)):
            disId = disObj["ID"]
            for prjObj in parseJson(getProjects(divId, disId)):
                count += 1
                form = {
                    'ID': 0, 'pageTraverse': 1,
                    'Division': divId, 'hdnDistrict': '', 'hdnProject': '',
                    'District': disId, 'Taluka': '', 'Village': '',
                    'Project': prjObj["ID"], 'CertiNo': '',
                    'btnSearch': 'Search',
                }
                parseXMLData(getData(link, form))
def parseXMLData(htmldata):
    """Extract project rows from a search-result HTML page.

    Appends one [project name, promoter name, cert no, (detail URL)] row per
    qualifying table row to the module-level `alldata`, and records each
    certificate-detail URL in `links`, keyed by certificate number.

    Args:
        htmldata: HTML text of one search-result page.

    Returns:
        The accumulated module-level `alldata` list.
    """
    global alldata, links
    soup = BeautifulSoup(htmldata, "html.parser")
    for table in soup.find_all("table"):
        print(len(alldata))
        # .get() avoids a KeyError on tables with no class attribute
        # (the original `attr['class']` crashed on such tables).
        if "table" not in table.attrs.get('class', []):
            continue
        tbody = table.find_all("tbody")
        if not tbody:
            continue
        for tr in tbody[0].find_all("tr"):
            sublist = []
            td_lst = tr.find_all("td")
            if len(td_lst) > 6:
                sublist.append(td_lst[1].text)  # project name
                sublist.append(td_lst[2].text)  # promoter name
                certNo = td_lst[3].text
                sublist.append(certNo)
                a_lst = td_lst[4].find_all("a")
                if a_lst:
                    href = a_lst[0].attrs['href']
                    # Local name on purpose: does not clobber the
                    # module-level `link` used by getProjectsList.
                    detail = "https://maharerait.mahaonline.gov.in/" + href
                    links[certNo] = detail
                    sublist.append(detail)
            if sublist:
                alldata.append(sublist)
    return alldata
def writedata(alldata1, filename):
    """Write the given rows to ./<filename> as CSV.

    The first row is written empty (placeholder for a title row, kept for
    compatibility with the original output).

    Args:
        alldata1: List of rows (each a list of field values).
        filename: File name relative to the current directory.
    """
    print(" >>>> FINAL PRINTING DATA >>>> ")
    # newline='' is required by the csv module: without it every row gets
    # an extra blank line on Windows.  Also stop rebinding `csvfile` to the
    # writer (shadowing made the original confusing).
    with open("./" + filename, 'w', newline='', encoding='utf-8') as fh:
        writer = csv.writer(fh, delimiter=',')
        writer.writerow("")  # empty placeholder title row
        for row in alldata1:
            writer.writerow(row)
def processlinksforcert():
    """Fetch each certificate-detail page and collect professional rows.

    For every URL stored in the module-level `links`, downloads the page,
    locates the 'DivProfessional' div, and appends
    [cert no, person name, cert no, professional type] rows to the
    module-level `certificatedata`.

    Returns:
        The accumulated `certificatedata` list.
    """
    global links, certificatedata
    print(">> Came in fetching certificates data >>> ")
    for certno, url in links.items():
        soup = BeautifulSoup(getData(url, {}), "html.parser")
        for div in soup.find_all("div"):
            if "DivProfessional" not in div.attrs.get("id", ""):
                continue
            tables = div.find_all("table")
            if not tables:
                continue
            if "table" not in tables[0].attrs["class"]:
                continue
            print(len(certificatedata))
            # Skip the header row, then one data row per professional.
            for tr in tables[0].find_all("tr")[1:]:
                td_lst = tr.find_all("td")
                if len(td_lst) > 2:
                    certificatedata.append([
                        certno,
                        formattext(td_lst[0].text),
                        formattext(td_lst[1].text),
                        formattext(td_lst[2].text),
                    ])
    return certificatedata
def formattext(text):
    """Strip all CRLF sequences and all spaces from `text`.

    The CRLF loop is kept because removing "\r\n" can re-form a new
    "\r\n" from surrounding characters (e.g. "\r\r\n\n"); the space loop
    was redundant since a single str.replace removes every occurrence and
    deleting spaces can never create new ones.
    """
    while "\r\n" in text:
        text = text.replace("\r\n", "")
    return text.replace(" ", "")
def main():
    """Scrape all project rows and certificate rows; write each to a CSV.

    The two print-message strings were split across lines in the paste
    (a SyntaxError); they are rejoined here.
    """
    global alldata, certificatedata
    getProjectsList()
    print("Before write the projects data to the file. Count >> "
          + str(len(alldata)))
    writedata(alldata, "data.csv")
    data = processlinksforcert()
    print("Before write the certificates data to the file. Count >> "
          + str(len(data)))
    writedata(data, "certificates.csv")
# Guard so the scraper only runs when executed as a script, not on import.
if __name__ == "__main__":
    main()
欢迎任何人指出我做错了什么……我已经用 pip 安装了所有依赖，包括 beautifulsoup。请不要介意缩进，它只是为了在这里展示……
我觉得问题在于我甚至无法从我的浏览器访问你的某些网址，例如 “https://maharerait.mahaonline.gov.in/SearchList/GetTaluka”。我想可能你必须向目标站点传递 cookie 或登录信息，代码才能获取数据。目前它无法访问数据源，这就是为什么它在超时后崩溃。 – Grynets
但我在 Excel VBA 中为同一个网站写过类似的抓取程序（宏），它工作正常……不过我注意到当我 ping maharerait.mahaonline.gov.in 时，返回 100% 丢包 –
你问的是建议，而且在我看来问题不在你的代码。这个网站本身有问题，因为正如你所说，ping 它会得到 100% 丢包，这就是脚本无法正常工作的原因。你需要调查你的 Python 代码和 Excel VBA 代码之间的主要区别。在这一点上我帮不上忙，因为我没有 Excel VBA 的经验；但我猜问题藏在 cookie 或认证附近。 – Grynets