我试图通过Zillow页面循环并提取数据。我知道该URL是与每个迭代后的新页面数更新,但提取的数据是因为如果该URL仍然是第1页的通过网页循环访问webscrape数据
import selenium
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
import pandas as pd
next_page='https://www.zillow.com/romeo-mi-48065/real-estate-agent-reviews/'
num_data1=pd.DataFrame(columns=['name','number'])
browser=webdriver.Chrome()
browser.get('https://www.zillow.com/romeo-mi-48065/real-estate-agent-reviews/')
while True:
page=requests.get(next_page)
contents=page.content
soup = BeautifulSoup(contents, 'html.parser')
number_p=soup.find_all('p', attrs={'class':'ldb-phone-number'},text=True)
name_p=soup.find_all('p', attrs={'class':'ldb-contact-name'},text=True)
number_p=pd.DataFrame(number_p,columns=['number'])
name_p=pd.DataFrame(name_p,columns=['name'])
num_data=number_p['number'].apply(lambda x: x.text.strip())
nam_data=name_p['name'].apply(lambda x: x.text.strip())
number_df=pd.DataFrame(num_data,columns=['number'])
name_df=pd.DataFrame(nam_data,columns=['name'])
num_data0=pd.concat([number_df,name_df],axis=1)
num_data1=num_data1.append(num_data0)
try:
button=browser.find_element_by_css_selector('.zsg-pagination>li.zsg-pagination-next>a').click()
next_page=str(browser.current_url)
except IndexError:
break
非常好,这工作完美!谢谢!! – ashkrelja