请问Selenium能采用多线程吗?
问题描述:
请老师帮我看看怎么回事?代码中采用几个线程,爬取的数据就重复几次,且重复数据均是第一页的内容。
相关代码:
"""Two-stage, multi-threaded Selenium scraper for student records.

Stage 1 ("list" workers) takes list-page URLs from ``q_url``, collects the
detail-page links found on each page and pushes them onto
``q_parse_student``.

Stage 2 ("detail" workers) takes those links, reads the student fields from
the detail table and inserts one document per student into MongoDB.

Stage 2 is only started after every stage-1 worker has finished.
"""
import queue
import threading
import time

import pymongo
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Configure the browser options shared by every driver.
chrome_options = Options()
# Headless mode: no UI is shown, so this also runs on a Linux host without a
# display.
chrome_options.add_argument("--headless")
# Works around the "DevToolsActivePort file doesn't exist" start-up error.
chrome_options.add_argument("--no-sandbox")
# Officially recommended switch to disable the GPU; avoids some bugs.
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
# Chrome's switch is `--user-agent=<value>`. The value is handed to Chrome
# without going through a shell, so it must not be wrapped in extra quotes.
chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36')
# NOTE(review): every driver created below is started against this single
# profile directory. Chrome generally cannot share one profile between
# concurrently running instances, so this may be why all workers end up on
# the same (first) page. Consider giving each worker its own copy of the
# profile -- TODO confirm against the target site / login flow.
chrome_options.add_argument(r"user-data-dir=C:\Users\Administrator\AppData\Local\Google\Chrome\User Data")

# Connect to the database.
# NOTE(review): credentials are hard-coded here; consider loading them from
# the environment or a config file that is kept out of version control.
client = pymongo.MongoClient(host='linzpao.zjjy.vip',port=27017,username='linzpao-admin', password='linzpao123')
mydb = client["linzpao"]
mycollection = mydb['fy']
# Start from an empty collection on every run.
mycollection.delete_many({})

# Work queues: list-page URLs, and the detail-page URLs found on them.
q_url = queue.Queue()
q_parse_student = queue.Queue()

# XPath of the value cell in a given row of the detail table.
_DETAIL_CELL_XPATH = '//div[3]/div/div[2]/div/table/tbody/tr[{row}]/td[2]'

# (document key, table row) pairs, listed in the order the keys are stored.
_DETAIL_FIELDS = (
    ('student_id', 2),
    ('student_name', 3),
    ('student_sex', 4),
    ('student_class', 6),
    ('student_phone', 7),
    ('student_college', 5),
)


def _new_driver():
    """Start a maximized Chrome driver configured with ``chrome_options``."""
    driver = webdriver.Chrome(options=chrome_options)
    driver.maximize_window()
    return driver


def _drain(work_queue):
    """Yield items from *work_queue* until it is empty.

    ``get_nowait()`` plus ``queue.Empty`` is used instead of checking
    ``empty()`` first: with several workers, another thread can take the last
    item between the check and the ``get``.
    """
    while True:
        try:
            yield work_queue.get_nowait()
        except queue.Empty:
            return


def _run_workers(target, thread_names):
    """Run *target* once per name in its own thread and wait for all of them."""
    threads = [
        threading.Thread(target=target, args=(thread_name,))
        for thread_name in thread_names
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()


def parse_student(name):
    """Detail-page worker: scrape every URL in ``q_parse_student``.

    For each URL the student fields are read from the detail table, printed
    and inserted into ``mycollection``.

    Args:
        name: Human-readable worker name (currently only used for debugging).
    """
    student_web = None
    try:
        for content in _drain(q_parse_student):
            # One browser per worker, started only if there is work to do.
            if student_web is None:
                student_web = _new_driver()
            student_web.get(content)
            student_info = {
                key: student_web.find_element(
                    By.XPATH, _DETAIL_CELL_XPATH.format(row=row)
                ).text
                for key, row in _DETAIL_FIELDS
            }
            print(student_info)
            mycollection.insert_one(student_info)
    finally:
        # Always shut the browser down, even if a page failed to parse.
        if student_web is not None:
            student_web.quit()


def main(name):
    """List-page worker: collect detail links from every URL in ``q_url``.

    Each link found in the second column of the list table (header row
    skipped) is printed and pushed onto ``q_parse_student``.

    Args:
        name: Human-readable worker name, printed around each page.
    """
    web = None
    try:
        for page in _drain(q_url):
            print(name+"开始")
            # One browser per worker, started only if there is work to do.
            if web is None:
                web = _new_driver()
            web.get(page)
            students_id = web.find_elements(By.XPATH, '//tr[position()>1]//td[2]/a')
            for link in students_id:
                student_url = link.get_attribute('href')
                print(student_url)
                q_parse_student.put(student_url)
            print(name+"结束")
    finally:
        # Always shut the browser down, even if a page failed to load.
        if web is not None:
            web.quit()


if __name__ == '__main__':
    # Queue the list pages to crawl (pages 1 and 2).
    for i in range(1,3):
        url = 'https://fyjy.eduw.cn/sysadmins/stu/stu_list.aspx?fst=186&snd=338&thd=&order=&s=0&pagecount=&xh=&xm=&yx=&bj=&sf=&cs=&sslh=&ssch=&page={}'.format(i)
        print(url)
        q_url.put(url)

    # Stage 1: gather all detail-page links.
    _run_workers(main, ["列表页采集线程1号","列表页采集线程2号"])

    # Stage 2: scrape the detail pages gathered in stage 1.
    _run_workers(parse_student, ["详情页采集线程1号","详情页采集线程2号"])
相关截图:
11
收起
正在回答 回答被采纳积分+1
1回答
Python全能工程师
- 参与学习 人
- 提交作业 16233 份
- 解答问题 4470 个
全新版本覆盖5大热门就业方向:Web全栈、爬虫、数据分析、软件测试、人工智能,零基础进击Python全能型工程师,从大厂挑人到我挑大厂,诱人薪资在前方!
了解课程
恭喜解决一个难题,获得1积分~
来为老师/同学的回答评分吧
0 星