It always throws the same error: TypeError: can't pickle lxml.etree._Element objects.
But when I test the same xpath extraction in a separate standalone .py file, it crawls the target data just fine.
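For reference, a minimal single-threaded sketch along the lines of my standalone test (illustrative only, with trimmed-down headers; not my exact test file):

    import requests
    from lxml import etree

    # fetch one page and run the same xpath directly, without any queues or threads
    url = "https://movie.douban.com/top250?start=0&filter="
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    response = requests.get(url=url, headers=headers)
    response.encoding = "utf-8"
    html = etree.HTML(response.text)
    # the same title xpath used in parse() below; here it prints the titles with no pickle error
    print(html.xpath("//span[@class='title'][1]/text()"))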
Relevant code:
# requests for sending HTTP requests
import requests
# Queue for passing data between the crawl threads and the parse threads
from multiprocessing import Queue
# regular expressions for extracting data from pages
import re
# threading for concurrent crawling
import threading
# HandleMongo class from handle_MongoDB
from beans.handle_MongoDB import HandleMongo
# json for deserializing the fetched data
import json
# lxml etree for xpath extraction
from lxml import etree
class Crawl_page(threading.Thread):
    # sends a request for each page number
    def __init__(self, thread_name, page_queue, data_queue):
        super(Crawl_page, self).__init__()
        self.thread_name = thread_name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.header = {
            "Host": "movie.douban.com",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
            "Sec-Fetch-Dest": "document",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-Mode": "navigate",
            # "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cookie": "bid=HEHQSb85oQ8; __utmz=30149280.1616629462.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmz=223695111.1616629462.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=i8PWIlHWOw4jkmP11axZb700EM8Ag2wR; ll='108296'; __gads=ID=e5b7c149b81191ee-22b739aad3c6009e:T=1616629491:RT=1616629491:S=ALNI_MZR2aPfaKyfEoBSa41HnHAIqbI89A; _vwo_uuid_v2=DA91F4FCF55C0C1DA17D647A3F64E9DF4|ff20e888130656a32c64f9b3958d18b5; ap_v=0,6.0; _pk_ses.100001.4cf6=*; __utma=30149280.354710368.1616629462.1616629462.1616813240.2; __utma=223695111.1923931149.1616629462.1616629462.1616813240.2; __utmb=223695111.0.10.1616813240; __utmt_douban=1; __utmb=30149280.7.10.1616813240; _pk_id.100001.4cf6=f6abb0ee3bec103d.1616629462.2.1616815625.1616629731."
        }

    def run(self):
        # thread entry point
        print("Starting thread: {}".format(self.thread_name))
        # get_nowait() behaves like a timeout of zero: if nothing can be fetched from the queue, it raises immediately
        # once page_flag is True, stop getting from page_queue
        while not page_flag:
            try:
                page = self.page_queue.get_nowait()
            except Exception as e:
                pass
            else:
                # only the start offset in the URL changes between pages, so page can be formatted into each page_url
                count = (int(page) - 1) * 25
                page_url = "https://movie.douban.com/top250?start={}&filter=".format(count)
                print("Requesting URL: {}".format(page_url))
                response = requests.get(url=page_url, headers=self.header)
                response.encoding = "utf-8"
                html = etree.HTML(response.text)
                self.data_queue.put(html)
class Crawl_html(threading.Thread):
    # processes the data returned for each page
    def __init__(self, thread_name, data_queue, lock):
        super(Crawl_html, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.lock = lock

    def run(self):
        # thread entry point
        print("Starting text-processing thread: {}".format(self.thread_name))
        # get_nowait() behaves like a timeout of zero: if nothing can be fetched from the queue, it raises immediately
        # once data_flag is True, stop getting from data_queue
        while not data_flag:
            try:
                html = self.data_queue.get_nowait()
            except Exception as e:
                pass
            else:
                # normally, each item fetched from the queue is handled by the custom parse() method
                result = self.parse(html)
                with self.lock:
                    HandleMongo().insert_data(result)

    def parse(self, html):
        movie_name = html.xpath("//span[@class='title'][1]/text()")
        # print(movie_name)
        actors_information_raw = html.xpath("//div[@class='bd']/p[1]/text()")
        actors_information = []
        for i in range(0, len(actors_information_raw)):
            if int(i) % 2 == 0:
                actors_information.append(actors_information_raw[i].strip())
        # print(actors_information)
        score = html.xpath("//span[@property='v:average']/text()")
        # print(score)
        evaluate = html.xpath("//div[@class='star']/span[last()]/text()")
        # print(evaluate)
        describe = html.xpath("//p[@class='quote']/span/text()")
        # print(describe)
        data = []
        for i in range(0, len(movie_name)):
            son_data = {"movie_name": movie_name[i], "actors_information": actors_information[i], "score": score[i], "evaluate": evaluate[i], "describe": describe[i]}
            data.append(son_data)
        return data
page_flag = False
data_flag = False

def main():
    # queue of page numbers
    page_queue = Queue()
    # queue of page data
    data_queue = Queue()
    # create a lock to serialize access from the parse threads
    lock = threading.Lock()
    # put the page numbers into page_queue
    for page in range(1, 10):
        page_queue.put(page)
    # qsize() returns the number of items in the queue
    print("Number of pages currently queued: {}".format(page_queue.qsize()))
    crawlist = ["Page thread 1", "Page thread 2", "Page thread 3"]
    # keep references to the page threads
    page_thread_list = []
    for thread_name_page in crawlist:
        # create a thread from the Crawl_page class
        thread_page = Crawl_page(thread_name=thread_name_page, page_queue=page_queue, data_queue=data_queue)
        # start the thread
        thread_page.start()
        # store the thread so it can be joined later
        page_thread_list.append(thread_page)
    # stopping the page threads
    global page_flag
    # while page_queue is not empty, keep waiting; once it is empty, set page_flag to True so that run() exits
    while not page_queue.empty():
        pass
    page_flag = True
    # join the page threads
    for thread_page_join in page_thread_list:
        thread_page_join.join()
        print(thread_page_join.thread_name, "finished")
    parselist = ["Parse thread 1", "Parse thread 2", "Parse thread 3"]
    # keep references to the parse threads
    parse_thread_list = []
    for thread_name_parse in parselist:
        # create a thread from the Crawl_html class
        thread_parse = Crawl_html(thread_name=thread_name_parse, data_queue=data_queue, lock=lock)
        # start the thread
        thread_parse.start()
        # store the thread so it can be joined later
        parse_thread_list.append(thread_parse)
    # stopping the parse threads
    global data_flag
    # while data_queue is not empty, keep waiting; once it is empty, set data_flag to True so that run() exits
    while not data_queue.empty():
        pass
    data_flag = True
    # join the parse threads
    for thread_parse_join in parse_thread_list:
        thread_parse_join.join()
        print(thread_parse_join.thread_name, "finished")

if __name__ == "__main__":
    main()
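A minimal sketch of what I think is going on, isolated from the crawler above (my assumption: multiprocessing.Queue serializes every item with pickle in a background feeder thread, and lxml element trees refuse to be pickled):

    import pickle
    from lxml import etree

    # build a small element tree, the same kind of object that self.data_queue.put(html) hands to the queue
    element = etree.HTML("<html><body><p>demo</p></body></html>")
    try:
        # multiprocessing.Queue does essentially this to every item before passing it on
        pickle.dumps(element)
    except TypeError as err:
        print(err)  # a "can't pickle ..." TypeError, like the one in my traceback

If that is really the cause, then presumably putting response.text (a plain string, which pickles fine) on data_queue and calling etree.HTML() inside Crawl_html would avoid it, or queue.Queue from the standard library could be used instead of multiprocessing.Queue, since these are threads rather than processes and queue.Queue does not pickle at all. I am not sure which of these is the intended fix.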