After finishing the video I tried writing this myself and ran into a few problems; hoping for some help.
# https://search.51job.com/list/150200,000000,0000,00,9,99,python,2,1.html
import requests
import threading
from multiprocessing import Queue  # note: multiprocessing.Queue, even though the workers below are threads (see question 3)
import re
import time


# Page-crawling thread: pulls page numbers off page_queue, downloads each
# listing page, and pushes the raw HTML onto data_queue
class page_url(threading.Thread):
    def __init__(self, thread_name, page_queue, data_queue):
        super(page_url, self).__init__()
        self.thread_name = thread_name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.header = {
            "Host": "jobs.51job.com",  # note: the requests below actually go to search.51job.com
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-User": "?1",
            "Sec-Fetch-Dest": "document",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9"
        }

    def run(self):
        print("Thread started: {0}".format(self.thread_name))
        while not self.page_queue.empty():
            try:
                page = self.page_queue.get_nowait()
            except:
                pass
            else:
                url = "https://search.51job.com/list/150200,000000,0000,00,9,99,python,2," + str(page) + ".html"
                response = requests.get(url=url, headers=self.header)
                self.data_queue.put(response.text)
# HTML-parsing thread: pulls raw page HTML off data_queue and extracts the
# job-listing JSON embedded in the page
class data_html(threading.Thread):
    def __init__(self, thread_name, data_queue):
        super(data_html, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue
        # self.lock = lock

    def run(self):
        print("Thread started: {0}".format(self.thread_name))
        # Checking the contents of data_queue: it turns out to be empty,
        # unless I add time.sleep(10) first -- then it is not empty
        print(self.data_queue.qsize())
        while not self.data_queue.empty():
            try:
                text = self.data_queue.get_nowait()
            except:
                pass
            else:
                data_search = re.compile(r"window\.__SEARCH_RESULT__\s=\s(.*?)</script>")
                data = data_search.search(text)
                job_items = data.group(1)
                # print(job_items)
def main():
    page_queue = Queue()
    data_queue = Queue()
    lock = threading.Lock()
    for page in range(1, 11):
        # url = "https://search.51job.com/list/150200,000000,0000,00,9,99,python,2," + str(page) + ".html"
        page_queue.put(page)

    thread_url_name_list = ["page thread 1", "page thread 2"]
    thread_url_name_end_list = []
    for thread_url_name in thread_url_name_list:
        thread_page = page_url(thread_name=thread_url_name, page_queue=page_queue, data_queue=data_queue)
        thread_page.start()
        thread_url_name_end_list.append(thread_page)
    # meant to wait for the page threads to finish (see answer point 2 below)
    while page_queue.empty():
        for url_end in thread_url_name_end_list:
            url_end.join()

    thread_html_name_list = ["html thread 1", "html thread 2"]
    thread_html_name_end_list = []
    for thread_html_name in thread_html_name_list:
        thread_html = data_html(thread_name=thread_html_name, data_queue=data_queue)
        thread_html.start()
        thread_html_name_end_list.append(thread_html)
    while data_queue.empty():
        for html_end in thread_html_name_end_list:
            html_end.join()


if __name__ == "__main__":
    main()
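One thing worth guarding against in the parsing step above: re.search() returns None when the marker is absent (for instance if the site serves an error or verification page instead of a listing), and then data.group(1) raises an AttributeError. A self-contained sketch of a guarded version, assuming the captured group is the JSON object 51job assigns to window.__SEARCH_RESULT__ (extract_jobs is my own name, not from the course):

import json
import re

SEARCH_RE = re.compile(r"window\.__SEARCH_RESULT__\s=\s(.*?)</script>")

def extract_jobs(html):
    # Return the parsed job-listing object, or None if the marker is missing
    match = SEARCH_RE.search(html)
    if match is None:
        return None
    return json.loads(match.group(1))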
Instead of setting the global flags page_flag = False and data_flag = False as the loop conditions the way the video does, my idea was to keep looping and pulling from a queue as long as it is not empty. In testing, the page-crawling class works fine, but the HTML-parsing class has a problem: I have to wait a while before getting data from data_queue, otherwise data_queue has nothing in it.

Question 1: Why does data_queue only have data after waiting a while?

Question 2: The instructor's video doesn't add any wait either, yet it reads from data_queue just fine. Why is that? Did I just not watch the video carefully enough?

Question 3: Running this code repeatedly sometimes raises an error: BrokenPipeError: [WinError 109] The pipe has been ended.
The explanation I found says: when a queue is used for inter-process communication and the other process has been closed, using the queue again raises this error.
I don't quite follow. Aren't these threads? Am I somehow using the queue after a thread has been closed?
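To make question 1 concrete, here is a minimal sketch of the timing race as I understand it, with the HTTP round-trip replaced by a stand-in time.sleep (I use queue.Queue here only to keep the sketch self-contained):

import threading
import time
from queue import Queue

q = Queue()

def producer():
    time.sleep(1)              # stands in for the HTTP round-trip in page_url.run()
    q.put("page html")

def consumer():
    # Same gate as data_html.run(): at this instant the queue is still empty
    # because the producer is mid-"download", so the loop body never runs
    while not q.empty():
        print(q.get_nowait())
    print("consumer found the queue empty and exited")

threading.Thread(target=producer).start()
threading.Thread(target=consumer).start()

The consumer almost always prints its exit message immediately, a full second before the producer puts the item, which is exactly the symptom of the html threads quitting before any download has finished.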
Hi there!
1. Based on the code you provided, you did not add a lock, so multiple threads compete for time slices and retrieving the data gets delayed.
2. What you wrote is an infinite loop: the code inside the for loop only runs when the while condition holds, blocking to wait for the child threads to finish. Without join(), the main thread may run faster than the child threads, and so the main thread ends first.
Keep at it, and happy learning!
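My own reading of the questions, for what it is worth: question 1 looks like a startup race. Each download takes a noticeable fraction of a second, while the html threads evaluate data_queue.empty() almost immediately after start(), so their while loop exits before anything has arrived; time.sleep(10) merely papers over that, which also answers question 2. For question 3, my understanding is that multiprocessing.Queue moves items through an OS pipe via a background feeder thread even within a single process, and if the program exits while that feeder is still writing, Windows reports WinError 109; since everything here is threads in one process, queue.Queue avoids the pipe entirely. A sketch of main() with those two changes, assuming page_url and data_html stay exactly as posted (they only ever use the queue objects handed to them, so swapping the import is enough):

from queue import Queue

def main():
    page_queue = Queue()
    data_queue = Queue()
    for page in range(1, 11):
        page_queue.put(page)

    page_threads = []
    for name in ["page thread 1", "page thread 2"]:
        t = page_url(thread_name=name, page_queue=page_queue, data_queue=data_queue)
        t.start()
        page_threads.append(t)
    for t in page_threads:
        t.join()               # wait for every download; no while-gate needed

    html_threads = []
    for name in ["html thread 1", "html thread 2"]:
        t = data_html(thread_name=name, data_queue=data_queue)
        t.start()
        html_threads.append(t)
    for t in html_threads:
        t.join()

if __name__ == "__main__":
    main()

Note that this deliberately runs the two stages one after the other. A fully pipelined version, where parsing starts while pages are still downloading, needs something like the video's page_flag/data_flag sentinels, because empty() alone cannot distinguish "nothing yet" from "all done".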