老师,爬取51job网站python岗位信息,不会做,可以给正确答案吗
老师,项目作业爬取51job网站python岗位信息,不会做,可以给正确答案吗
7
收起
正在回答
2回答
同学,你好!作业修改后的代码如下:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
class Job51Crawler:
    """Crawl job listings from 51job.com using Selenium.

    For every job card the crawler collects the job name, salary,
    location/requirement info and benefit tags, then writes all records
    to a plain-text file.
    """

    def __init__(self, keyword="python工程师", output_file="a.txt"):
        """Create the browser session.

        Args:
            keyword: search term typed into the site's search box
                (defaults to the original hard-coded "python工程师").
            output_file: path of the text file results are written to
                (defaults to the original hard-coded "a.txt").
        """
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)
        self.base_url = "https://search.51job.com/list/000000,000000,0000,00,9,99,+,2,1.html"
        self.keyword = keyword
        self.output_file = output_file
        self.jobs_data = []

    def search_jobs(self):
        """Open the site, submit the search and start parsing results."""
        self.driver.get(self.base_url)
        # "keywordInput" is the search box id on the current site layout.
        search_input = self.wait.until(
            EC.presence_of_element_located((By.ID, "keywordInput"))
        )
        search_input.send_keys(self.keyword)
        # Give the page time to attach its JS handlers before clicking.
        time.sleep(5)
        search_btn = self.wait.until(
            EC.element_to_be_clickable((By.ID, "search_btn"))
        )
        search_btn.click()
        # Wait for the first result page to render.
        time.sleep(5)
        self.parse_pages()

    def parse_pages(self):
        """Walk through every result page until the "next" button is disabled."""
        page_num = 1
        while True:
            print(f"正在爬取第 {page_num} 页...")
            try:
                self.wait.until(
                    EC.presence_of_element_located((By.CLASS_NAME, "joblist"))
                )
            except TimeoutException:
                print("等待职位列表超时")
                break
            self.parse_html(self.driver.page_source)
            try:
                next_page_btn = self.driver.find_element(By.CLASS_NAME, "btn-next")
                # A disabled "next" button marks the last page.
                if "disable" in next_page_btn.get_attribute("class"):
                    print("已到达最后一页")
                    break
                next_page_btn.click()
                page_num += 1
                time.sleep(3)  # let the next page load
            except (NoSuchElementException, TimeoutException):
                print("无法找到下一页按钮或已到达最后一页")
                break

    @staticmethod
    def _element_text(job_item, class_name, default="暂无数据"):
        """Return the stripped text of a child element located by class name.

        Falls back to *default* when the element is missing OR its text is
        empty — the original code let a missing "jname" element raise and
        abort the whole page; this helper handles all four fields uniformly.
        """
        try:
            return job_item.find_element(By.CLASS_NAME, class_name).text.strip() or default
        except NoSuchElementException:
            return default

    def parse_html(self, content):
        """Extract job records from the currently loaded result page.

        *content* (the raw page HTML) is accepted for interface
        compatibility, but elements are located through the live driver
        because the page has already finished loading.
        """
        try:
            job_items = self.driver.find_elements(By.CLASS_NAME, "joblist-item-job-wrapper")
            for job_item in job_items:
                self.jobs_data.append({
                    "岗位名称": self._element_text(job_item, "jname"),
                    "薪资": self._element_text(job_item, "sal"),
                    "工作信息": self._element_text(job_item, "shrink-0"),
                    "福利待遇": self._element_text(job_item, "tags"),
                })
        except Exception as e:
            print(f"解析页面时出错: {e}")

    def save_to_file(self):
        """Write all collected records to ``self.output_file`` (UTF-8 text)."""
        try:
            with open(self.output_file, "w", encoding="utf-8") as f:
                for i, job in enumerate(self.jobs_data, 1):
                    f.write(f"职位 {i}:\n")
                    f.write(f"  岗位名称: {job['岗位名称']}\n")
                    f.write(f"  薪资: {job['薪资']}\n")
                    f.write(f"  工作信息: {job['工作信息']}\n")
                    f.write(f"  福利待遇: {job['福利待遇']}\n")
                    f.write("\n")
            print(f"成功保存 {len(self.jobs_data)} 条职位信息到 {self.output_file}")
        except Exception as e:
            print(f"保存文件时出错: {e}")

    def run(self):
        """Run the crawler, always closing the browser afterwards."""
        try:
            self.search_jobs()
            self.save_to_file()
        finally:
            self.driver.quit()
if __name__ == "__main__":
crawler = Job51Crawler()
crawler.run()祝学习愉快~
恭喜解决一个难题,获得1积分~
来为老师/同学的回答评分吧
0 星