请问Selenium能采用多线程吗?
问题描述:
请老师帮我看看怎么回事?代码中开了几个线程,爬取到的数据就会重复几次,而且重复的数据全部都是第一页的内容。
相关代码:
"""Two-stage threaded Selenium crawler for a student list site.

Stage 1 (``main``): worker threads take list-page URLs from ``q_url``, open
each page in Chrome and push every student detail link found onto
``q_parse_student``.

Stage 2 (``parse_student``): worker threads take detail URLs from
``q_parse_student``, read the fields of the student table and insert one
document per student into MongoDB.
"""
import queue
import threading
import time

import pymongo
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Browser configuration shared by every Chrome instance started below.
chrome_options = Options()
# Headless mode: no visible window, so the script can run without a display.
chrome_options.add_argument("--headless")
# Works around the "DevToolsActivePort file doesn't exist" start-up error.
chrome_options.add_argument("--no-sandbox")
# Officially recommended switch to sidestep some GPU-related bugs.
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
# Chrome's switch is ``--user-agent=<value>``; the value must not carry
# literal quote characters or they become part of the header that is sent.
chrome_options.add_argument(
    '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
)
# NOTE(review): every worker thread starts its own Chrome with this same
# profile directory. Chrome does not support several concurrent instances
# on one user-data-dir -- confirm whether this is related to the duplicated
# first-page results, and consider a separate copy of the profile per thread.
chrome_options.add_argument(r"user-data-dir=C:\Users\Administrator\AppData\Local\Google\Chrome\User Data")

# Database connection.
# NOTE(review): credentials are hard-coded in source; move them to
# configuration or environment variables before sharing this file.
client = pymongo.MongoClient(
    host='linzpao.zjjy.vip',
    port=27017,
    username='linzpao-admin',
    password='linzpao123',
)
mydb = client["linzpao"]
mycollection = mydb['fy']
# Start every run from an empty collection.
mycollection.delete_many({})

# Work queues: list-page URLs, then the detail-page URLs discovered on them.
q_url = queue.Queue()
q_parse_student = queue.Queue()

# Document key -> row index of the detail table holding that value.
# The order is the key order of the stored document.
_STUDENT_FIELD_ROWS = (
    ('student_id', 2),
    ('student_name', 3),
    ('student_sex', 4),
    ('student_class', 6),
    ('student_phone', 7),
    ('student_college', 5),
)
_STUDENT_CELL_XPATH = '//div[3]/div/div[2]/div/table/tbody/tr[{row}]/td[2]'
_STUDENT_LINK_XPATH = '//tr[position()>1]//td[2]/a'


def _new_driver():
    """Start a maximized Chrome instance using the shared options."""
    driver = webdriver.Chrome(options=chrome_options)
    driver.maximize_window()
    return driver


def _drain(work_queue):
    """Yield items from *work_queue* until it is empty, never blocking.

    ``get(block=False)`` plus ``queue.Empty`` is used instead of checking
    ``empty()`` first: with several consumers another thread can take the
    last item between the check and the ``get``.
    """
    while True:
        try:
            item = work_queue.get(block=False)
        except queue.Empty:
            return
        yield item


def parse_student(name):
    """Worker: scrape each queued detail page and store it in MongoDB.

    Args:
        name: Display name of the worker thread (kept for interface
            compatibility; not used for output).
    """
    driver = None
    try:
        for detail_url in _drain(q_parse_student):
            # Start the browser lazily so an idle worker never launches one,
            # then reuse it for every URL this thread handles.
            if driver is None:
                driver = _new_driver()
            driver.get(detail_url)
            student_info = {
                field: driver.find_element(
                    By.XPATH, _STUDENT_CELL_XPATH.format(row=row)
                ).text
                for field, row in _STUDENT_FIELD_ROWS
            }
            # Print before inserting: insert_one adds an ``_id`` key in place.
            print(student_info)
            mycollection.insert_one(student_info)
    finally:
        # Always close the browser, even when a page fails to parse.
        if driver is not None:
            driver.quit()


def main(name):
    """Worker: collect student detail links from each queued list page.

    Args:
        name: Display name of the worker thread, used in progress output.
    """
    driver = None
    try:
        for page in _drain(q_url):
            print(name + "开始")
            if driver is None:
                driver = _new_driver()
            driver.get(page)
            for link in driver.find_elements(By.XPATH, _STUDENT_LINK_XPATH):
                student_url = link.get_attribute('href')
                print(student_url)
                q_parse_student.put(student_url)
            print(name + "结束")
    finally:
        if driver is not None:
            driver.quit()


def _run_workers(target, thread_names):
    """Run one thread per name executing *target(name)* and wait for all."""
    workers = [
        threading.Thread(target=target, args=(thread_name,))
        for thread_name in thread_names
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    for i in range(1, 3):
        url = 'https://fyjy.eduw.cn/sysadmins/stu/stu_list.aspx?fst=186&snd=338&thd=&order=&s=0&pagecount=&xh=&xm=&yx=&bj=&sf=&cs=&sslh=&ssch=&page={}'.format(i)
        print(url)
        q_url.put(url)
    # Short pause carried over from the original script.
    time.sleep(2)

    # Stage 1 must finish before stage 2 starts, otherwise the detail
    # workers could find their queue empty and exit too early.
    _run_workers(main, ["列表页采集线程1号", "列表页采集线程2号"])
    _run_workers(parse_student, ["详情页采集线程1号", "详情页采集线程2号"])
相关截图:
11
收起
正在回答 回答被采纳积分+1
1回答
Python全能工程师
- 参与学习 人
- 提交作业 16330 份
- 解答问题 4470 个
全新版本覆盖5大热门就业方向:Web全栈、爬虫、数据分析、软件测试、人工智能,零基础进击Python全能型工程师,从大厂挑人到我挑大厂,诱人薪资在前方!
了解课程
恭喜解决一个难题,获得1积分~
来为老师/同学的回答评分吧