项目作业-异步执行速度
相关代码:
#!/usr/bin/env python3 # -*- coding=utf-8 -*- import asyncio import aiohttp import re from lxml import etree import pymongo # 请求并将数据丢到html_queue的队列中 async def request(session, html_queue, sem, url_queue): # 传入处理协程数量 async with sem: # 取出url_queue的数据 url = await url_queue.get() # 异步的方式去request豆瓣的电影页面的HTML async with session.get(url) as resp: data = await resp.text() # 将数据丢入到html_queue的队列中 await html_queue.put(data) # 将html_queue队列中的数据取出,处理后写入到DB中 async def handle(html_queue, lock, mycollection): # 将队列中的数据取出 html_text = await html_queue.get() # 将数据丢到etree中处理 html = etree.HTML(html_text) for li_number in range(1, 26): #################### 整理数据 #################### # 整理相关电影资讯 movie_path = html.xpath( f'// *[ @ id = "content"] / div / div[1] / ol / li[{li_number}] / div / div[2] / div[1] / a/span/text()') movie_name_list = [] for name in movie_path: # 将电影名称的空白消除 movie_name_list.append(re.compile(r'(\s)+/(\s)+').sub('/', name)) movie_name = ''.join(list(filter(None, movie_name_list))) actors_path = html.xpath(f'//*[@id="content"]/div/div[1]/ol/li[{li_number}]/div/div[2]/div[2]/p/text()') ac_list = [] for ac in actors_path: # 将演员资讯之间的空白消除,以及换行消除 ac_result = re.compile(r'(\s)/(\s)').sub('/', re.compile(r'[\n]|[\s]{2,}').sub('', ac)) ac_list.append(ac_result) actors_information = ''.join(list(filter(None, ac_list))) score_path = html.xpath( f'//*[@id="content"]/div/div[1]/ol/li[{li_number}]/div/div[2]/div[2]/div/span[2]/text()') score = ''.join(score_path) evaluate_path = html.xpath( f'//*[@id="content"]/div/div[1]/ol/li[{li_number}]/div/div[2]/div[2]/div/span[4]/text()') evaluate = ''.join(evaluate_path) describe_path = html.xpath( f'//*[@id="content"]/div/div[1]/ol/li[{li_number}]/div/div[2]/div[2]/p[2]/span/text()') describe = ''.join(describe_path) # 创建一个字典将资料都放入其中 info = { 'movie_name': movie_name, 'actors_information': actors_information, 'score': score, 'evaluate': evaluate, 'describe': describe } # 将数据丢到DB中 async with lock: mycollection.insert_one(info) # 定义异步主程序 async def main(): import time start_time = time.time() # 定义协程池数量 sem = asyncio.Semaphore(1) header = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9", "Connection": "keep-alive", "Host": "movie.douban.com", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" } # 生成url数据,并丢到url_queue队列中 url_queue = asyncio.Queue() for page in range(0,226, 25): url = f"https://movie.douban.com/top250?start={page}" await url_queue.put(url) # 调用request异步函数 html_queue = asyncio.Queue() async with aiohttp.ClientSession(headers=header) as session: while not url_queue.empty(): await request(session, html_queue, sem, url_queue) # 实例化mongo client m_client = pymongo.MongoClient('mongodb://account:password@172.16.1.10:27017') mydb = m_client['movie'] mycollection = mydb['movie_info'] # 生成lock,并调用handle异步函数 lock = asyncio.Lock() while not html_queue.empty(): await handle(html_queue, lock, mycollection) end_time = time.time() print(end_time-start_time) if __name__ == '__main__': # 开始循环 loop = asyncio.get_event_loop() loop.run_until_complete(main())
问题描述:
你好,为什么通过异步爬取会比多线程还要慢?
我的理解是每次异步request之后,就会马上换下一个request,因此并不会阻塞IO
而多线程则是通过大量的线程去减少阻塞IO的时间
我认为两者的时间在这个作业里面应该是差不多的,但是实际运行异步所消耗的时间约为多线程的两倍,让我不知道是不是异步哪里的逻辑出了问题,可能实际上还是有阻塞的部份我没看出来
因此想请教一下
29
收起
正在回答 回答被采纳积分+1
Python全能工程师
- 参与学习 人
- 提交作业 16247 份
- 解答问题 4470 个
全新版本覆盖5大热门就业方向:Web全栈、爬虫、数据分析、软件测试、人工智能,零基础进击Python全能型工程师,从大厂挑人到我挑大厂,诱人薪资在前方!
了解课程
恭喜解决一个难题,获得1积分~
来为老师/同学的回答评分吧
0 星