Project assignment - async execution speed
Relevant code:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import asyncio
import aiohttp
import re
from lxml import etree
import pymongo
# Fetch a page and put the returned HTML onto html_queue
async def request(session, html_queue, sem, url_queue):
    # Bound how many coroutines run this section at once
    async with sem:
        # Take one URL off url_queue
        url = await url_queue.get()
        # Request the Douban movie page HTML asynchronously
        async with session.get(url) as resp:
            data = await resp.text()
            # Put the HTML onto html_queue
            await html_queue.put(data)
# Take HTML off html_queue, parse it, and write the results to the DB
async def handle(html_queue, lock, mycollection):
    # Take one page's HTML off the queue
    html_text = await html_queue.get()
    # Hand the HTML to lxml's etree for parsing
    html = etree.HTML(html_text)
    for li_number in range(1, 26):
        #################### Extract the data ####################
        # Collect the movie title fragments
        movie_path = html.xpath(
            f'//*[@id="content"]/div/div[1]/ol/li[{li_number}]/div/div[2]/div[1]/a/span/text()')
        movie_name_list = []
        for name in movie_path:
            # Strip the whitespace around the "/" separators in the title
            movie_name_list.append(re.compile(r'(\s)+/(\s)+').sub('/', name))
        movie_name = ''.join(list(filter(None, movie_name_list)))
        actors_path = html.xpath(f'//*[@id="content"]/div/div[1]/ol/li[{li_number}]/div/div[2]/div[2]/p/text()')
        ac_list = []
        for ac in actors_path:
            # Remove newlines and runs of whitespace from the cast info
            ac_result = re.compile(r'(\s)/(\s)').sub('/', re.compile(r'[\n]|[\s]{2,}').sub('', ac))
            ac_list.append(ac_result)
        actors_information = ''.join(list(filter(None, ac_list)))
        score_path = html.xpath(
            f'//*[@id="content"]/div/div[1]/ol/li[{li_number}]/div/div[2]/div[2]/div/span[2]/text()')
        score = ''.join(score_path)
        evaluate_path = html.xpath(
            f'//*[@id="content"]/div/div[1]/ol/li[{li_number}]/div/div[2]/div[2]/div/span[4]/text()')
        evaluate = ''.join(evaluate_path)
        describe_path = html.xpath(
            f'//*[@id="content"]/div/div[1]/ol/li[{li_number}]/div/div[2]/div[2]/p[2]/span/text()')
        describe = ''.join(describe_path)
        # Collect the fields into one dict
        info = {
            'movie_name': movie_name,
            'actors_information': actors_information,
            'score': score,
            'evaluate': evaluate,
            'describe': describe
        }
        # Write the record to the DB (note: pymongo's insert_one is a blocking call)
        async with lock:
            mycollection.insert_one(info)
# The async main program
async def main():
    import time
    start_time = time.time()
    # Concurrency limit: Semaphore(1) lets only one request() run at a time
    sem = asyncio.Semaphore(1)
    header = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Host": "movie.douban.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
    }
    # Generate the URLs and put them onto url_queue
    url_queue = asyncio.Queue()
    for page in range(0, 226, 25):
        url = f"https://movie.douban.com/top250?start={page}"
        await url_queue.put(url)
    # Call the request coroutine (each call is awaited to completion before the next starts)
    html_queue = asyncio.Queue()
    async with aiohttp.ClientSession(headers=header) as session:
        while not url_queue.empty():
            await request(session, html_queue, sem, url_queue)
    # Instantiate the Mongo client
    m_client = pymongo.MongoClient('mongodb://account:password@172.16.1.10:27017')
    mydb = m_client['movie']
    mycollection = mydb['movie_info']
    # Create the lock and call the handle coroutine for each page
    lock = asyncio.Lock()
    while not html_queue.empty():
        await handle(html_queue, lock, mycollection)
    end_time = time.time()
    print(end_time - start_time)
if __name__ == '__main__':
    # Start the event loop
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())

Problem description:
Hi, why does crawling asynchronously turn out even slower than multithreading?
My understanding is that after each async request is issued, control immediately moves on to the next request, so the IO should not block.
Multithreading, by contrast, reduces the time spent blocked on IO by throwing a large number of threads at it.
I would expect the two approaches to take roughly the same time on this assignment, but in practice the async version takes about twice as long as the multithreaded one. I can't tell whether something in my async logic is wrong; perhaps there is still some blocking part that I haven't spotted.
So I would like to ask for some advice.
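
For reference, here is a minimal self-contained sketch of how the fetch stage could be scheduled concurrently: instead of awaiting each request() call to completion inside the while loop (which serializes every download), all the coroutines are created first and awaited together with asyncio.gather. The Semaphore(10) value is an illustrative choice, not anything prescribed by the assignment:

import asyncio
import aiohttp

# Sketch: overlap the downloads instead of running them one after another.
async def fetch(session, sem, url, html_queue):
    async with sem:                                # at most N requests in flight
        async with session.get(url) as resp:
            await html_queue.put(await resp.text())

async def main():
    urls = [f"https://movie.douban.com/top250?start={p}" for p in range(0, 226, 25)]
    html_queue = asyncio.Queue()
    sem = asyncio.Semaphore(10)                    # illustrative concurrency limit
    async with aiohttp.ClientSession() as session:
        # Create all the coroutines first, then wait on them together, so the
        # requests run concurrently rather than strictly one at a time.
        await asyncio.gather(*(fetch(session, sem, u, html_queue) for u in urls))
    print(html_queue.qsize(), "pages fetched")

if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())

With "await request(...)" inside a while loop, each download must finish before the next begins, so the run degenerates into sequential HTTP plus event-loop overhead, which would explain it being slower than the threaded version.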
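
On the "blocking I haven't spotted" point: pymongo is a synchronous driver, so mycollection.insert_one(info) blocks the whole event loop while it runs. One possible workaround, sketched below reusing the mycollection and info names from the script above, is to push the blocking call onto a thread pool with run_in_executor (asyncio.to_thread would do the same on Python 3.9+):

import asyncio

# Sketch: run a blocking pymongo call off the event loop so other
# coroutines can keep making progress while the insert completes.
async def save(mycollection, info):
    loop = asyncio.get_event_loop()
    # insert_one is synchronous; the default thread-pool executor (None)
    # runs it in a worker thread instead of on the event-loop thread.
    await loop.run_in_executor(None, mycollection.insert_one, info)

An async MongoDB driver such as motor is another option and avoids the thread hop altogether.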