0
我编写了一些代码来获取大学课程的数据,用于构建一个交互式排课程序。下面是我用来获取数据的代码(主题:优化 Selenium 代码):
from selenium import webdriver
import os
import pwd
import shlex
import re
import time
# Registry of scraped courses; Database() stores one dict per course here,
# keyed by the course's "Section" value.
classes = {}

# Chrome is run headless so that no browser window is opened.
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('window-size=1200x600')
options.binary_location = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'

# chromedriver is expected to have been downloaded into the current user's
# Downloads folder.
usr = pwd.getpwuid(os.getuid()).pw_name
Path = '/Users/%s/Downloads/chromedriver' % usr

# Start the browser session and load the course database.
driver = webdriver.Chrome(Path, chrome_options=options)
driver.get('https://web.stevens.edu/scheduler/core/2017F/2017F.xml')
def Database(AllSelectedCourseInfo):
    """Parse one course's ``name=value`` tokens and register it in ``classes``.

    Each token is split on its first ``=`` only, so a value that itself
    contains ``=`` is kept whole.  Tokens without ``=`` and tokens whose value
    is empty are ignored.  A ``Z`` that directly follows a digit at the very
    end of a value is dropped (e.g. ``14:00:00Z`` becomes ``14:00:00``);
    any other ``Z`` in the value is left alone.

    Args:
        AllSelectedCourseInfo: iterable of ``name=value`` strings describing
            a single course.

    Raises:
        KeyError: if no token supplies a non-empty ``Section`` value, because
            the section is used as the key in the module-level ``classes``
            dictionary.
    """
    ClassDict = {}
    for item in AllSelectedCourseInfo:
        name, separator, value = item.partition("=")
        if not separator:
            # Not a name=value pair; nothing to record.
            continue
        # Strip only the trailing Z that follows a digit, not every Z.
        value = re.sub(r"(?<=\d)Z$", "", value)
        if not value:
            continue
        try:
            ClassDict[str(name)] = str(value)
        except UnicodeError:
            # On Python 2, str() cannot encode non-ASCII text.  Keep the
            # parsing best-effort by skipping just that field.
            continue
    classes[str(ClassDict["Section"])] = ClassDict
def _xpath_literal(text):
    """Return *text* as an XPath 1.0 string literal, coping with embedded quotes."""
    if "'" not in text:
        return "'%s'" % text
    if '"' not in text:
        return '"%s"' % text
    # XPath 1.0 has no escape syntax, so join the apostrophe-free pieces
    # with concat(), inserting each apostrophe as a double-quoted literal.
    pieces = ("'%s'" % part for part in text.split("'"))
    return "concat(%s)" % ", \"'\", ".join(pieces)


def _course_tokens(element, extra_levels):
    """Return the ``name=value`` tokens from the text of an ancestor of *element*.

    The ancestor is reached by going up one level plus *extra_levels* more.
    Tag-closing symbols are stripped and the text is split shell-style so
    that quoted strings stay together.
    """
    raw = element.find_element_by_xpath(".." + "/.." * extra_levels).text
    return shlex.split(raw.replace("/>", "").replace(">", ""))


def makeDatabase(section):
    """Look up *section* in the loaded page and store the matching course(s).

    *section* is matched against the exact text of page elements.  If the
    parent of the first match mentions ``Title`` the text is treated as a
    course name and every element with that text is stored; otherwise it is
    treated as a single section and only the first match is stored.

    Args:
        section: course name or section code to search for.

    Raises:
        Whatever ``driver.find_element_by_xpath`` raises when nothing
        matches, and ``KeyError`` from ``Database`` when the parsed text has
        no ``Section`` field.
    """
    query = "//*[text()=%s]" % _xpath_literal(section)
    first_match = driver.find_element_by_xpath(query)
    if "Title" in first_match.find_element_by_xpath("..").text:
        # Course name given: store every course carrying that name.
        for match in driver.find_elements_by_xpath(query):
            Database(_course_tokens(match, 4))
    else:
        # Section given: reuse the element already found instead of paying
        # for another WebDriver round trip.
        Database(_course_tokens(first_match, 3))
def printDic():
    """Print every stored course: a header line per section, then its fields.

    Reads the module-level ``classes`` dictionary.  Each ``print`` call takes
    a single pre-formatted string, so the output is the same on Python 2 and
    Python 3.
    """
    for section, info in classes.items():
        print("\n-------------%s------------" % section)
        for field, value in info.items():
            print("%s : %s" % (field, value))
# Time a sample run: one lookup by course name and one by section code.
start = time.time()
try:
    makeDatabase("Differential Calculus")
    makeDatabase("MA 124B")
    printDic()
    end = time.time()
    print(end - start)  # elapsed seconds
finally:
    # Always shut the browser down, even if scraping fails part-way, so no
    # headless Chrome process is left running.
    driver.quit()
目前我只提取一门课程和一个课程班级(section)的数据,就需要大约20秒。如果真正投入使用,至少需要提取7门课程,光是创建字典就要花费一分钟。有谁知道怎样让它运行得更快吗?
真的有必要用 Selenium 吗?你不能用 requests 下载 XML,然后用 BeautifulSoup 之类的库来解析它吗? – perfect5th
我使用 Selenium,是因为我以前用过它。我会研究一下 BeautifulSoup 在这种情况下是否效果更好。 – Jake
Selenium 在这里可能有些大材小用,因为你实际上只是抓取一个 XML 页面。另外可以看看 requests(http://docs.python-requests.org/en/master/),它是获取页面的一种简单方法。 – perfect5th