2017-07-29 177 views
0

我编写了一些代码来获取大学课程的数据,用于构建交互式排课程序。以下是我用来获取数据的代码(标题:优化 Selenium 代码):

from selenium import webdriver 
import os 
import pwd 
import shlex 
import re 
import time 


# Login name of the user running this script, used to locate chromedriver.
usr = pwd.getpwuid(os.getuid()).pw_name
# chromedriver must already be downloaded to this location.
# NOTE(review): the /Users/<name>/Downloads layout looks macOS-specific — confirm.
Path = ('/Users/%s/Downloads/chromedriver') %usr
# Create a new instance of the Chrome driver
options = webdriver.ChromeOptions()
# Explicit path to the Chrome binary that chromedriver should launch.
options.binary_location = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
options.add_argument('headless') # Headless so no window is opened
options.add_argument('window-size=1200x600')
driver = webdriver.Chrome(Path, chrome_options=options)


# Load the course listing; every later lookup queries this page through `driver`.
driver.get('https://web.stevens.edu/scheduler/core/2017F/2017F.xml') # Go to database


# Section identifier -> dict of that section's attributes; filled by Database().
classes = {}


def Database(AllSelectedCourseInfo):
    """Store one section's ``name=value`` tokens in the global ``classes`` dict.

    Args:
        AllSelectedCourseInfo: iterable of strings. Tokens of the form
            ``name=value`` are recorded; tokens without ``=`` and tokens with
            an empty value are ignored.

    Raises:
        KeyError: if none of the tokens supplied a ``Section`` value, since
            that value is used as the key into ``classes``.
    """
    ClassDict = {}

    for item in AllSelectedCourseInfo:
        # Split on the FIRST "=" only, so values that themselves contain "="
        # are kept whole instead of being truncated.
        name, separator, value = item.partition("=")
        if not separator:
            continue  # not a name=value token

        # Drop only a trailing "Z" that directly follows a digit (as in
        # "13:00:00Z"); other "Z" characters in the value are left alone.
        value = re.sub(r"(?<=\d)Z$", "", value)
        if not value:
            continue  # attributes without a value are not stored

        try:
            ClassDict[str(name)] = str(value)
        except UnicodeError:
            # Under Python 2, str() of non-ASCII unicode text raises; keep the
            # original best-effort behaviour and skip just this attribute.
            continue

    classes[str(ClassDict["Section"])] = ClassDict


def makeDatabase(section):
    """Look up ``section`` on the loaded page and record what is found.

    ``section`` is matched against node text with an XPath ``text()='...'``
    test. If the text of the first match's parent contains "Title", the value
    is treated as a course title (per the original author's comments) and
    every matching node is recorded; otherwise it is treated as a single
    section identifier and only the first match is recorded.

    Args:
        section: course title or section identifier to search for.
            NOTE(review): the value is interpolated into a single-quoted
            XPath literal, so a value containing ``'`` will produce an
            invalid expression.

    Each selected node's ancestor text is tokenised with ``shlex.split`` and
    passed to ``Database``. Any exception raised by the driver when nothing
    matches propagates to the caller.
    """
    xpath = "//*[text()='%s']" % section

    # Every driver call is a round-trip to the browser, so the first match is
    # fetched once and reused instead of being looked up again per branch.
    first_match = driver.find_element_by_xpath(xpath)

    if "Title" in first_match.find_element_by_xpath("..").text:
        matches = driver.find_elements_by_xpath(xpath)
        ancestor_path = ".." + "/.." * 4
    else:
        matches = [first_match]
        ancestor_path = ".." + "/.." * 3

    for match in matches:
        raw_text = match.find_element_by_xpath(ancestor_path).text
        # Remove the tag-closing symbols so shlex can group quoted values
        # into single name=value tokens.
        tokens = shlex.split(raw_text.replace("/>", "").replace(">", ""))
        Database(tokens)


def printDic():
    """Print every entry of ``classes``: a header line with the key, then one
    ``name : value`` line per stored attribute."""
    for section_key, details in classes.items():
        print("\n-------------%s------------" % section_key)
        for field, field_value in details.items():
            print("%s : %s" % (field, field_value))

# Time how long it takes to collect and print the requested courses.
start = time.time()
try:
    makeDatabase("Differential Calculus")
    makeDatabase("MA 124B")
    printDic()
    end = time.time()

    print(end - start)
finally:
    # Always shut the browser down, even if a lookup above raised; otherwise
    # the Chrome process started by the driver is left running.
    driver.quit()

从一门课程和一个课程班次(section)中提取数据大约需要 20 秒;如果实际投入使用,至少需要提取 7 门课程,光是创建字典就要花一分多钟。有谁知道怎样能让它运行得更快吗?

+2

Selenium 真的有必要吗?你不能用 requests 下载 XML,然后用 BeautifulSoup 这样的库来解析它吗? – perfect5th

+0

我使用 Selenium 是因为我以前用过它。我会研究一下 BeautifulSoup 在这种情况下是否效果更好。 – Jake

+0

Selenium 在这里可能有些大材小用,因为您实际上只是抓取一个 XML 页面。另请查看 requests(http://docs.python-requests.org/en/master/),它提供了获取页面的简单方法。 – perfect5th

回答

0

我尝试把 lxml 和 requests 集成到我的代码中,但没能取到我想要的数据。用 lxml 尝试了几天无果之后,我决定改用 urllib 搭配 beautifulsoup4。效果比我预期的还要好:

from bs4 import BeautifulSoup 
from HTMLParser import HTMLParser 
import urllib 
import shlex 
import re 
import time 

# Entity decoder; makeDatabase() calls h.unescape() on each raw class node.
h = HTMLParser()
# Download the whole course listing once, when this module is executed.
page = urllib.urlopen('https://web.stevens.edu/scheduler/core/2017F/2017F.xml').read() # Fetch the course database
# NOTE(review): no parser is named here, so bs4 chooses one itself — confirm
# which parser is installed, since it shapes the tree walked below.
soup = BeautifulSoup(page)

# Hard-coded positional path to the list of per-class nodes.
# NOTE(review): presumably tied to this document's layout and the parser in use — confirm.
RawClassData = soup.contents[10].contents[0].contents[0].contents

# Section identifier -> dict of that section's attributes; filled by makeDatabase().
classes = {}
# Not referenced anywhere in the visible code.
backupClasses = {}

def _parse_attributes(tokens):
    """Return a dict built from the ``name=value`` strings in ``tokens``."""
    attributes = {}
    for token in tokens:
        # Split on the FIRST "=" only, so values that themselves contain "="
        # are kept whole instead of being truncated.
        name, separator, value = token.partition("=")
        if not separator:
            continue  # not a name=value token

        # Drop only a trailing "Z" that directly follows a digit (as in
        # "13:00:00Z"); other "Z" characters in the value are left alone.
        value = re.sub(r"(?<=\d)Z$", "", value)
        if not value:
            continue  # attributes without a value are not stored

        try:
            attributes[str(name)] = str(value)
        except UnicodeError:
            # Under Python 2, str() of non-ASCII unicode text raises; keep the
            # original best-effort behaviour and skip just this attribute.
            continue
    return attributes


def makeDatabase():
    """Fill the global ``classes`` dict from every node in ``RawClassData``.

    Each node is rendered to text, has its ``>`` characters replaced by
    spaces, is entity-decoded and is then tokenised with ``shlex.split`` so
    quoted attribute values stay in one piece. Nodes that cannot be tokenised,
    or that yield no ``section`` attribute, are skipped.
    """
    for raw_entry in RawClassData:
        try:
            tokens = shlex.split(h.unescape(str(raw_entry).replace(">", " ")))
        except ValueError:
            # shlex raises ValueError on unbalanced quotes, and Unicode
            # errors are ValueError subclasses; skip only this node.
            continue

        ClassDict = _parse_attributes(tokens)

        section = ClassDict.get("section")
        if section is None:
            continue  # no section attribute, so there is no key to store under
        classes[section] = ClassDict


def printDic():
    """Write every entry of ``classes`` to a file named ``Classes`` in the
    current directory: a header line with the key, one ``name : value`` line
    per stored attribute, then a newline."""
    with open("Classes", "w") as out:
        for section_key, details in classes.items():
            out.write("\n-------------%s------------" % section_key)
            for field, field_value in details.items():
                out.write("\n%s : %s" % (field, field_value))
            out.write("\n")

def printSection(selection):
    """Print a header for ``selection`` followed by its stored attributes.

    The header is printed first; ``KeyError`` is then raised if ``selection``
    is not a key of ``classes``.
    """
    print("\n-------------%s------------" % selection)
    details = classes[selection]
    for field in details:
        print("%s : %s" % (field, details[field]))

def printClass(selection):
    """Print every stored section that ``selection`` refers to.

    ``selection`` may be a course title, in which case every section whose
    ``title`` attribute equals it is printed, or a section key of ``classes``,
    in which case that one section is printed. Nothing is printed, and no
    exception is raised, when it matches neither.

    The original used ``finally`` for the section lookup, so a title that was
    not also a section key always ended in ``KeyError``.
    """
    for key, details in classes.items():
        # .get(): a section without a title must not abort the whole scan.
        if details.get("title") == selection:
            print("\n-------------%s------------" % key)
            for classkey, value in details.items():
                print("%s : %s" % (classkey, value))

    # Direct lookup by section key, only when such a key actually exists.
    if selection in classes:
        print("\n-------------%s------------" % selection)
        for classkey, value in classes[selection].items():
            print("%s : %s" % (classkey, value))

# Only building the database is timed; printing happens afterwards.
start = time.time()
makeDatabase()
end = time.time()

# Courses to print, in this order.
for wanted_title in (
    "Circuits and Systems",
    "Differential Equations",
    "Writing & Communications Collqm",
    "Mechanics of Solids",
    "Electricity & Magnetism",
    "Engineering Design III",
    "Freshman Quiz",
):
    printClass(wanted_title)

# Write the full database to the "Classes" file, then report the build time.
printDic()

print(end - start)

这段新代码会为所有课程建立数据库,然后打印出所需的课程,总共只需 2 秒。Selenium 代码为所需课程建立数据库并打印出来需要 89 秒,所以我觉得这算是略有改进……感谢 perfect5th 的建议!