他总是报同一个错误TypeError: can't pickle lxml.etree._Element objects

但是单开一个py文件去测试的时候，我用xpath是可以实现目标爬取的；

相关代码：

#用于request发送请求
import requests
#建立队列
from multiprocessing import Queue
#用正则表达式来抽取网页数据
import re
#采用多线程爬取
import threading
#导入handle_MongoDB里的类
from beans.handle_MongoDB import HandleMongo
#导入json包，将获取的数据进行反序列化
import json
#用xpath语句获取相关信息
from lxml import etree

class Crawl_page(threading.Thread):
    #给页码发送请求
    def __init__(self,thread_name, page_queue, data_queue):
        super(Crawl_page,self).__init__()
        self.thread_name = thread_name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.header = {
            "Host":"movie.douban.com",
            "Connection":"keep-alive",
            "Upgrade-Insecure-Requests":"1",
            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
            "Sec-Fetch-Dest":"document",
            "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Sec-Fetch-Site":"none",
            "Sec-Fetch-Mode":"navigate",
            # "Accept-Encoding":"gzip, deflate, br",
            "Accept-Language":"zh-CN,zh;q=0.9",
            "Cookie":"bid=HEHQSb85oQ8; __utmz=30149280.1616629462.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmz=223695111.1616629462.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=i8PWIlHWOw4jkmP11axZb700EM8Ag2wR; ll='108296'; __gads=ID=e5b7c149b81191ee-22b739aad3c6009e:T=1616629491:RT=1616629491:S=ALNI_MZR2aPfaKyfEoBSa41HnHAIqbI89A; _vwo_uuid_v2=DA91F4FCF55C0C1DA17D647A3F64E9DF4|ff20e888130656a32c64f9b3958d18b5; ap_v=0,6.0; _pk_ses.100001.4cf6=*; __utma=30149280.354710368.1616629462.1616629462.1616813240.2; __utma=223695111.1923931149.1616629462.1616629462.1616813240.2; __utmb=223695111.0.10.1616813240; __utmt_douban=1; __utmb=30149280.7.10.1616813240; _pk_id.100001.4cf6=f6abb0ee3bec103d.1616629462.2.1616815625.1616629731."
        }

    def run(self):
        #多线程的启动方法
        print("当前启动的线程为{}".format(self.thread_name))
        #使用no_wait就相当于将timeout设置为无限小，当队列堵塞get不出来东西就会直接抛异常
        #当page_flag 为True的时候，终止对page_queue的get
        while not page_flag:
            try:
                page = self.page_queue.get_nowait()
            except Exception as e:
                pass
            else:
                # 因为不同页面网页只会变化同一个url中的某一处的数字，可以用page的值去格式化每一次的page_url
                count = (int(page)-1)*25
                page_url = "https://movie.douban.com/top250?start={}&filter=".format(count)
                print("当前请求的URL为：{}".format(page_url))
                response = requests.get(url=page_url, headers=self.header)
                response.encoding = "utf-8"
                html = etree.HTML(response.text)
                self.data_queue.put(html)


class Crawl_html(threading.Thread):
    #处理页码返回的数据
    def __init__(self, thread_name, data_queue, lock):
        super(Crawl_html, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.lock = lock

    def run(self):
        #多线程的启动方法
        print("当前处理文本任务的线程为{}".format(self.thread_name))
        #使用no_wait就相当于将timeout设置为无限小，当队列堵塞get不出来东西就会直接抛异常
        #当data_flag 为True的时候，终止对data_queue的get
        while not data_flag:
            try:
                html = self.data_queue.get_nowait()
            except Exception as e:
                pass
            else:
                # 正常情况下，将每次get到的text数据用自定义函数parse()处理
                result = self.parse(html)
                with self.lock:
                    HandleMongo().insert_data(result)


    def parse(self,html):
        movie_name = html.xpath("//span[@class='title'][1]/text()")
        # print(movie_name)
        actors_information_raw = html.xpath("//div[@class='bd']/p[1]/text()")
        actors_information = []
        for i in range(0, len(actors_information_raw)):
            if int(i) % 2 == 0:
                actors_information.append(actors_information_raw[i].strip())
        # print(actors_information)
        score = html.xpath("//span[@property='v:average']/text()")
        # print(score)
        evaluate = html.xpath("//div[@class='star']/span[last()]/text()")
        # print(evaluate)
        describe = html.xpath("//p[@class='quote']/span/text()")
        # print(describe)
        data = []
        for i in range(0,len(movie_name)):
            son_data = {"movie_name":movie_name[i], "actors_information":actors_information[i], "score":score[i], "evaluate":evaluate[i], "describe":describe[i]}
            data.append(son_data)
            return data


page_flag = False
data_flag = False
def main():
    # 页码队列
    page_queue = Queue()
    # 网页数据队列
    data_queue = Queue()

    # 为了防止多线程并发，先创建一个锁
    lock = threading.Lock()
    # 向page_queue中存储页码
    for page in range(1, 10):
        page_queue.put(page)
    # qsize()用于获取队列长度
    print("当前页码队列中储存的页码总量为{}".format(page_queue.qsize()))

    crawlist = ["页码处理线程1号", "页码处理线程2号", "页码处理线程3号"]
    # 存储处理页码的线程
    page_thread_list = []
    for thread_name_page in crawlist:
        # 引用Crawl_page函数，创建线程
        thread_page = Crawl_page(thread_name=thread_name_page, page_queue=page_queue, data_queue=data_queue)
        # 启动线程
        thread_page.start()
        # 将创建好的线程导入列表中，方便后面使用这些线程(释放线程)
        page_thread_list.append(thread_page)
    # 处理页码线程的停止
    global page_flag
    # 当page_queue不为空时，正常运行；当page_queue为空，page_flag变为True,从而让run()函数停止运行
    while not page_queue.empty():
        pass
    page_flag = True
    # 释放处理页码的线程
    for thread_page_join in page_thread_list:
        thread_page_join.join()
        print(thread_page_join.thread_name, "处理结束")

    parselist = ["文本处理线程1号", "文本处理线程2号", "文本处理线程3号"]
    # 存储处理文本线程
    parse_thread_list = []
    for thread_name_parse in parselist:
        # 引用Crawl_parse函数，创建线程
        thread_parse = Crawl_html(thread_name=thread_name_parse, data_queue=data_queue, lock=lock)
        # 启动线程
        thread_parse.start()
        # 将创建好的线程导入列表中，方便后面使用这些线程(释放线程)
        parse_thread_list.append(thread_parse)
    # 处理文本线程的停止
    global data_flag
    # 当parse_queue不为空时，正常运行；当parse_queue为空，data_flag变为True,从而让run()函数停止运行
    while not data_queue.empty():
        pass

    data_flag = True
    # 释放处理文本的线程
    for thread_parse_join in parse_thread_list:
        thread_parse_join.join()
        print(thread_parse_join.thread_name, "处理结束")


if __name__ == "__main__":
    main()

TeacherZhe 2021-04-03

源自：爬虫进阶与实战 8-3 项目作业

收起

1回答

时间， 2021-04-03 15:40:48

同学，你好！同学使用的是进程中的队列，在使用多线程往队列中存储lxml.etree._Element类型数据时报错了。同学可以使用queue中的Queue队列解决该问题。

祝学习愉快！

收起回答

提问者 TeacherZhe #1

那queue和Queue到底有什么区别啊？

2021-04-03 15:41:54
提问者 TeacherZhe #2

哦不，queue和multiprocessing里的这个Queue有什么区别

2021-04-03 15:45:27
时间，回复提问者 TeacherZhe #3

同学，你好！
1、from queue import Queue：是普通的队列模式，类似于普通列表，先进先出模式，可以应用于线程间的消息队列
2、from multiprocessing import Queue：是进程中的队列，用于解决多个子进程间的通信问题，有时在线程中也可以使用的。
祝学习愉快！

2021-04-03 16:34:19