
I am new to programming. I built a web scraper in Python using Beautiful Soup, but when I run the program it just opens the Python command line with a blinking cursor and nothing happens. Now I am getting these errors (Beautiful Soup Python error 10060, urgent):

TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond

ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

Please don't mind the indentation; below is my code:

import urllib.request 
import urllib 
import json 
import xml.etree.ElementTree as ET 
import csv 
from bs4 import BeautifulSoup 

link = 'https://maharerait.mahaonline.gov.in/searchlist/searchlist' 
talukaLink = "https://maharerait.mahaonline.gov.in/SearchList/GetTaluka" 
distlink = "https://maharerait.mahaonline.gov.in/SearchList/GetDistrict" 
prjLink = "https://maharerait.mahaonline.gov.in/SearchList/GetProjectName" 

alldata = [] 

links = {} 
certificatedata = [] 

def getData(url, values): 
    data = urllib.parse.urlencode(values) 
    data = data.encode('utf-8') 
    req = urllib.request.Request(url, data) 
    response=urllib.request.urlopen(req) 
    data = response.read() 
    data = data.decode("utf-8") 
    return data 


def getDivsion():
    ## for now we are taking 6 districts.. it needs to be updated when the data gets updated
    return range(1,7)

def getDistrict(divId):
    global distlink
    values = {'DivID': divId}
    data = getData(distlink, values)
    return data

def parseJson(data): 
    parsed = json.loads(data) 
    return parsed 

def getTaluka(disId): 
    global talukaLink 
    values= {'DisID': disId} 
    data = getData(talukaLink, values) 
    return data 

def getProjects(divId, disId):
    global prjLink
    values = {'DisID': disId, 'DivID': divId}
    #print(values)
    data = getData(prjLink, values)
    if len(data) < 10:
        return "{}"
    return data

def getProjectsList():
    divList = getDivsion()
    flag = 0
    for divId in divList:
        disData = getDistrict(divId)
        disList = parseJson(disData)
        for disObj in disList:
            disId = disObj["ID"]
            prjData = getProjects(divId, disId)
            #print(" >>>> "+str(disId)+" >> "+str(divId))
            #print(prjData)
            prjJson = parseJson(prjData)
            for prjObj in prjJson:
                flag += 1
                prjId = prjObj["ID"]
                values = {'ID': 0, 'pageTraverse': 1, 'Division': divId, 'hdnDistrict': '', 'hdnProject': '', 'District': disId, 'Taluka': '', 'Village': '', 'Project': prjId, 'CertiNo': '', 'btnSearch': 'Search'}
                finalPrjData = getData(link, values)
                parseXMLData(finalPrjData)
                #if len(alldata)>100:
                #    break

def parseXMLData(htmldata):
    global alldata, links
    soup = BeautifulSoup(htmldata, "html.parser")
    tables = soup.find_all("table")
    for table in tables:
        print(len(alldata))
        attr = table.attrs
        if "table" in attr['class']:
            tbody = table.find_all("tbody")
            if len(tbody) > 0:
                tbody = tbody[0]
                tr_lst = tbody.find_all("tr")
                for tr in tr_lst:
                    sublist = []
                    td_lst = tr.find_all("td")
                    if len(td_lst) > 6:
                        prjname = td_lst[1].text
                        proname = td_lst[2].text
                        certNo = td_lst[3].text
                        sublist.append(prjname)
                        sublist.append(proname)
                        sublist.append(certNo)
                        td = td_lst[4]
                        a_lst = td.find_all("a")
                        if len(a_lst) > 0:
                            a = a_lst[0]
                            href = a.attrs['href']
                            link = "https://maharerait.mahaonline.gov.in/" + href
                            links[certNo] = link
                            sublist.append(link)
                    if len(sublist) > 0:
                        alldata.append(sublist)
    return alldata


def writedata(alldata1, filename):
    print(" >>>> FINAL PRINTING DATA >>>> ")
    #import pdb; pdb.set_trace()
    with open("./" + filename, 'w') as csvfile:
        csvfile = csv.writer(csvfile, delimiter=',')
        #csvfile.writerow(titleRow)
        csvfile.writerow("")
        for i in range(0, len(alldata1)):
            #print(alldata1[i])
            csvfile.writerow(alldata1[i])


def processlinksforcert():
    global links, certificatedata
    print(">> Came in fetching certificates data >>> ")
    for certno in links.keys():
        link = links[certno]
        htmldata = getData(link, {})
        soup = BeautifulSoup(htmldata, "html.parser")
        divs = soup.find_all("div")
        for div in divs:
            attr = div.attrs
            if "id" in attr.keys() and "DivProfessional" in attr['id']:
                table = div.find_all("table")
                if len(table) <= 0:
                    continue
                t_attr = table[0].attrs
                if "table" in t_attr["class"]:
                    print(len(certificatedata))
                    table = table[0]
                    tr_lst = table.find_all("tr")
                    index = 1
                    while index < len(tr_lst):
                        #import pdb; pdb.set_trace()
                        #for tr in tr_lst:
                        #if index==0:
                        #    continue
                        tr = tr_lst[index]
                        index += 1
                        sublist = []
                        td_lst = tr.find_all("td")
                        if len(td_lst) > 2:
                            sublist.append(certno)
                            pername = formattext(td_lst[0].text)
                            cerno = formattext(td_lst[1].text)
                            proftype = formattext(td_lst[2].text)
                            sublist.append(pername)
                            sublist.append(cerno)
                            sublist.append(proftype)
                            certificatedata.append(sublist)
    return certificatedata

def formattext(text):
    while text.find("\r\n") >= 0:
        text = text.replace("\r\n", "")
    while text.find(" ") >= 0:
        text = text.replace(" ", "")
    return text

def main():
    global alldata, certificatedata
    #data = getData(url, {})
    getProjectsList()
    print("Before write the projects data to the file. Count >> " + str(len(alldata)))
    writedata(alldata, "data.csv")
    data = processlinksforcert()
    print("Before write the certificates data to the file. Count >> " + str(len(data)))
    writedata(data, "certificates.csv")


main() 

Can someone please suggest what I am doing wrong? I have installed everything with pip, including beautifulsoup. Please don't mind the indentation, it only got mangled when pasting it here.


I think the problem is that I can't even reach some of your URLs from my browser, for example "https://maharerait.mahaonline.gov.in/SearchList/GetTaluka". I suspect you may have to pass cookies or login information to your target site before the code can fetch any data. Right now it cannot reach the source at all, which is why it crashes once the timeout expires. – Grynets
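A minimal sketch of that idea with the standard library, assuming the site only needs a session cookie picked up from a first GET (this is untested against this particular site, which may additionally require a login or an anti-forgery token):

import http.cookiejar
import urllib.parse
import urllib.request

# Keep cookies across requests so the POST calls carry whatever session the site sets.
jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))

# First GET the search page to pick up any session cookies...
opener.open("https://maharerait.mahaonline.gov.in/searchlist/searchlist", timeout=30)

# ...then reuse the same opener for the POST endpoints from the question.
data = urllib.parse.urlencode({'DivID': 1}).encode('utf-8')
response = opener.open("https://maharerait.mahaonline.gov.in/SearchList/GetDistrict", data, timeout=30)
print(response.read().decode('utf-8'))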


But I built a macro for the same website in Excel VBA and it works fine... I did notice, though, that when I ping maharerait.mahaonline.gov.in it returns 100% loss. –


You asked for suggestions, and from what I can see the problem is not with your code. This particular website has issues, because, as you said yourself, you get 100% packet loss. That is why the script does not work. You need to investigate the main differences between your code and the Excel VBA code. I can't help there, since I have no experience with Excel VBA, but my guess is that the issue is hiding somewhere close to cookies or authentication. – Grynets

Answer


I solved it by using Selenium. Thank you very much, everyone.
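A minimal sketch of that route, assuming chromedriver is installed and on PATH (any Selenium-supported browser would do): the real browser handles the connection, cookies and TLS, and the rendered HTML is handed to Beautiful Soup exactly as in the question.

from bs4 import BeautifulSoup
from selenium import webdriver

# Let a real browser fetch the search page.
driver = webdriver.Chrome()
driver.get("https://maharerait.mahaonline.gov.in/searchlist/searchlist")

# Hand the rendered HTML to Beautiful Soup, same parsing as before.
soup = BeautifulSoup(driver.page_source, "html.parser")
print("tables found:", len(soup.find_all("table")))

driver.quit()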