Problems encountered while scraping

import requests
from lxml import etree
import threading
from queue import Queue
import pymongo


# Parse each movie's entry out of every list page
class PageParser(threading.Thread):
    def __init__(self, thread_name, page_queue, data_queue):
        super(PageParser, self).__init__()
        self.thread_name = thread_name
        self.page_queue = page_queue
        self.data_queue = data_queue

    def handle_requests(self, page_url):
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
        }
        response = requests.get(url=page_url, headers=header)
        if response.status_code == 200 and response:
            return response.text

    # Process the page content
    def parse_page(self, content):
        html = etree.HTML(content)
        page_all_data = html.xpath("//div[@class='article']/ol/li")
        for li in page_all_data:
            self.data_queue.put(li)  # put each movie's <li> node into data_queue
            print(li, type(li))

    def run(self):
        print(f"{self.thread_name} has started")
        try:
            while not self.page_queue.empty():
                page_url = self.page_queue.get()
                page_response = self.handle_requests(page_url)
                if page_response:
                    self.parse_page(page_response)
        except Exception as e:
            print(f"{self.thread_name} met error: {e}")
        print(f"{self.thread_name} has ended")


class DataParser(threading.Thread):
    def __init__(self, thread_name, data_queue, mongo, lock):
        super(DataParser, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.mongo = mongo
        self.lock = lock

    def handle_list_one(self, content):
        return ''.join(content)

    def handle_list_some(self, content):
        return '/'.join(content)

    def parse_data(self, content):
        data = {
            "movie_name": self.handle_list_some(content.xpath("//div[@class='info']/div[@class='hd']/a/span/text()")),
            "actors_information": self.handle_list_some(content.xpath("//div[@class='info']/div[@class='bd']/p[1]/text()")),
            "score": self.handle_list_one(content.xpath("//div[@class='info']/div[@class='bd']/div/span[@class='rating_num']/text()")),
            "evaluate": self.handle_list_one(content.xpath("//div[@class='info']/div[@class='bd']/div/span[@class='']/text()")),
            "describe": self.handle_list_one(content.xpath("//div[@class='info']/div[@class='bd']/p[2]/span/text()"))
        }
        with self.lock:
            self.mongo.insert_one(data)

    def run(self):
        print(f"{self.thread_name} has started")
        try:
            while not self.data_queue.empty():
                datas = self.data_queue.get()
                self.parse_data(datas)
        except Exception as e:
            print(f"{self.thread_name} met error: {e}")
        print(f"{self.thread_name} has ended")


def main():
    page_queue = Queue()
    for i in range(0, 26, 25):  # TODO: adjust after testing
        page_url = f"https://movie.douban.com/top250?start={i}&filter="
        page_queue.put(page_url)
    data_queue = Queue()

    page_thread_names = ["P1", "P2", "P3"]
    page_thread_recover = []
    for thread_name in page_thread_names:
        thread = PageParser(thread_name, page_queue, data_queue)
        thread.start()
        page_thread_recover.append(thread)
    while not page_queue.empty():
        pass
    for thread in page_thread_recover:
        if thread.is_alive():
            thread.join()

    # Set up the MongoDB connection
    myclient = pymongo.MongoClient("mongodb://127.0.0.1:1112")
    mydb = myclient["TOP250"]
    mycollection = mydb["movie_info"]

    lock = threading.Lock()

    data_thread_names = ["DE1", "DE2", "DE3", "DE4", "DE5"]
    data_thread_recover = []
    for thread_name in data_thread_names:
        thread = DataParser(thread_name, data_queue, mycollection, lock)
        thread.start()
        data_thread_recover.append(thread)
    while not data_queue.empty():
        pass
    for thread in data_thread_recover:
        if thread.is_alive():
            thread.join()


if __name__ == "__main__":
    main()



The code is above; it was written mainly by following the example from the earlier lessons of the course.

However, a few problems come up when the crawler actually runs:

  1. The same movie's information is scraped repeatedly

  2. The scraped ratings don't match the actual ones

  3. The number of raters can't be scraped at all


Could you point out what is wrong with the code?


1 Answer

Hello!

1. What data_queue stores are the node objects obtained with xpath from each list page. When parsing, an item should be taken out of data_queue and parsed relative to that previously obtained node. In your parse_data() method the xpath expressions start with //div, which searches the whole document again, so every movie's data is matched on every call and you end up with duplicates. You can modify the code along the lines of the screenshots below (a sketch follows them); the scraped rating will then be correct as well.

https://img1.sycdn.imooc.com//climg/614f01af094718d008860278.jpg

https://img1.sycdn.imooc.com//climg/614f017c095b16c413500475.jpg
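In case the screenshots do not load, here is a minimal sketch of what the corrected parse_data() might look like, as a drop-in replacement inside the DataParser class above. It assumes the fix is simply to make every xpath expression relative to the <li> node taken from data_queue (the leading // replaced with .//); the instructor's exact code in the screenshots may differ.

    # Sketch of a corrected parse_data(): 'content' is one movie's <li> node from
    # data_queue, so every expression starts with ".//" and only searches inside
    # that node instead of matching every movie in the document again.
    def parse_data(self, content):
        data = {
            "movie_name": self.handle_list_some(content.xpath(".//div[@class='info']/div[@class='hd']/a/span/text()")),
            "actors_information": self.handle_list_some(content.xpath(".//div[@class='info']/div[@class='bd']/p[1]/text()")),
            "score": self.handle_list_one(content.xpath(".//div[@class='info']/div[@class='bd']/div/span[@class='rating_num']/text()")),
            # the rater-count expression is adjusted separately in point 2 below
            "evaluate": self.handle_list_one(content.xpath(".//div[@class='info']/div[@class='bd']/div/span[@class='']/text()")),
            "describe": self.handle_list_one(content.xpath(".//div[@class='info']/div[@class='bd']/p[2]/span/text()"))
        }
        with self.lock:
            self.mongo.insert_one(data)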

2. The xpath expression for the number of raters needs to be adjusted; you can refer to the form in the screenshot below (a sketch follows it).

https://img1.sycdn.imooc.com//climg/614f0486098db7fe13500232.jpg
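Since that screenshot is also not legible here, one possible form is given below as an assumption about the Douban page structure, not as the instructor's exact expression: the rater count sits in a <span> that has no class attribute at all, so @class='' never matches it, while taking the last <span> of the star block relative to the <li> node does.

            # Hypothetical replacement for the "evaluate" entry in parse_data():
            # the count <span> carries no class attribute, so select the last <span>
            # inside the star <div> instead of looking for an empty class.
            "evaluate": self.handle_list_one(content.xpath(".//div[@class='bd']/div[@class='star']/span[last()]/text()")),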

Happy learning!

  • 坻屿 (asker) #1

    Both xpath expressions for the rater count show the same result in XPath Helper, so why is it that only the modified expression can actually scrape the data?

    2021-09-25 21:42:05
  • 好帮手慕美 replied to asker 坻屿 #2

    Hello! With your xpath expression, no result is returned, which is why the expression needs to be adjusted.

    https://img1.sycdn.imooc.com//climg/614fcfde09bf403b10090160.jpg

    Happy learning!

    2021-09-26 09:43:24
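A possible reason for the behaviour described in this follow-up, shown with a small standalone snippet (the HTML fragment is made up for illustration): in lxml, @class='' only matches an element that explicitly carries class="", not an element with no class attribute at all, so an expression that looks equivalent in a browser extension can still return an empty list here.

from lxml import etree

# Made-up fragment mimicking the rating block: the count <span> has no class attribute.
fragment = etree.HTML("<div class='star'><span class='rating_num'>9.7</span><span>123456人评价</span></div>")

print(fragment.xpath("//div[@class='star']/span[@class='']/text()"))    # [] -- needs an explicit class="" to match
print(fragment.xpath("//div[@class='star']/span[not(@class)]/text()"))  # ['123456人评价']
print(fragment.xpath("//div[@class='star']/span[last()]/text()"))       # ['123456人评价']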