1
解析google图片搜索结果时出现了问题。我试过用selenium webdriver
来做。它返回了100个结果,但速度很慢。于是我决定改用requests
模块请求同一个页面,但它只返回了20个结果。我怎样才能得到同样的100个结果?有没有办法分页之类的?
这是selenium代码(解析google图片搜索结果):
_url = r'imgurl=([^&]+)&'
for search_url in lines:
driver.get(normalize_search_url(search_url))
images = driver.find_elements(By.XPATH, u"//div[@class='rg_di']")
print "{0} results for {1}".format(len(images), ' '.join(driver.title.split(' ')[:-3]))
with open('urls/{0}.txt'.format(search_url.strip().replace('\t', '_')), 'ab') as f:
for image in images:
url = image.find_element(By.TAG_NAME, u"a")
u = re.findall(_url, url.get_attribute("href"))
for item in u:
f.write(item)
f.write('\n')
这里是requests
代码:
_url = r'imgurl=([^&]+)&'
for search_url in lines[:10]:
print normalize_search_url(search_url)
links = 0
request = requests.get(normalize_search_url(search_url))
soup = BeautifulSoup(request.text)
file = 'cars2/{0}.txt'.format(search_url.strip().replace(' ', '_'))
with open(file, 'ab') as f:
for image in soup.find_all('a'):
if 'imgurl' in image.get('href'):
links += 1
u = re.findall(_url, image.get("href"))
for item in u:
f.write(item)
f.write('\n')
print item
print "{0} links extracted for {1}".format(links, ' '.join(soup.title.name.split(' ')[:-3]))