2017-07-09 122 views
0

我无法获取指向下一页搜索结果的 href 链接。我能取到标签里的全部文字和内容,但一直想不明白该怎样去掉不需要的文字、只提取出 href 并据此翻页。请问如何获得下一页分页的 'href'?

这里是我的代码:

import requests 
from bs4 import BeautifulSoup 
import webbrowser 
import time 

# Build the Indeed search URL from the user's job type and location.
jobsearch = input("What type of job?: ") 
location = input("What is your location: ") 
url = ("https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location) 
base_url = 'https://ca.indeed.com/' 

r = requests.get(url) 
rcontent = r.content 
prettify = BeautifulSoup(rcontent, "html.parser") 

filter_words = ['engineering', 'instrumentation', 'QA'] 
all_job_url = [] 
nextpages = [] 
filtered_job_links = [] 
http_flinks = [] 
flinks = [] 

def all_next_pages(): 
    """Collect the absolute URL of every pagination link into `nextpages`.

    Bug fix: the original code called `next_page.find_all('a')` but
    discarded the result, then appended the entire pagination <div>
    instead of the links' hrefs.  Here each <a> is iterated and its
    href extracted and joined with the site root.
    """
    pages = prettify.find_all('div', {'class':'pagination'}) 
    for next_page in pages: 
        for anchor in next_page.find_all('a'): 
            href = anchor.get('href') 
            if href: 
                # Indeed pagination hrefs are relative (e.g. "/jobs?q=...&start=10");
                # strip the root's trailing slash so the join has exactly one "/".
                full_link = base_url.rstrip('/') + href 
                nextpages.append(full_link) 
                print(full_link) 

all_next_pages() 

回答

1

这是一种方式来获得搜索结果项目的链接。找到row result类,然后找到a标记,它包含您需要的所有信息。

import requests 
from bs4 import BeautifulSoup 
import webbrowser 
import time 

# Ask the user for search parameters and assemble the Indeed query URL.
jobsearch = input("What type of job?: ") 
location = input("What is your location: ") 
url = ("https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location) 
base_url = 'https://ca.indeed.com/' 

# Fetch the first results page and parse it with the lxml backend.
response = requests.get(url) 
soup = BeautifulSoup(response.text, "lxml") 

filter_words = ['engineering', 'instrumentation', 'QA'] 
all_job_url = [] 
nextpages = [] 
filtered_job_links = [] 
http_flinks = [] 
flinks = [] 

def all_next_pages(): 
    """Print the title and href of the first link in each job-result card."""
    for card in soup.find_all('div', {'class':' row result'}): 
        anchor = card.find('a') 
        print(anchor.get('title'), anchor.get('href')) 

all_next_pages()