Teacher, why does it keep showing "handshake failed", and why does the code stop running?

https://img1.sycdn.imooc.com/climg/6561ee7009a10ed818370474.jpg

import scrapy
from ..items import BiliItem
import time
from scrapy_redis.spiders import RedisSpider

class AppSpider(scrapy.Spider):
    name = "app"
    redis_key = 'app'
    # allowed_domains = ["www.bilibili.com"]
    # start_urls = ["https://search.bilibili.com/all?vt=18668812&keyword=python&from_source=webtop_search&spm_id_from=333.1007&search_source=5"]
    page = 30

    def __init__(self, *args, **kwargs):
        domain = kwargs.pop('domain', '')
        self.allowed_domains = filter(None, domain.split(','))
        super(AppSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        time.sleep(5)
        try:
            link = response.xpath('//div[@class="video-list row"]/div')
            for lin in link:
                lin_url = 'https:' + lin.xpath('.//div[@class="bili-video-card__wrap __scale-wrap"]/a/@href').get()
                yield scrapy.Request(url=lin_url, callback=self.lin_parse, dont_filter=True)

            new_url = f"https://search.bilibili.com/all?keyword=python&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page=11&o={self.page}"
            print(new_url)
            self.page += 30
            print(self.page)
            yield scrapy.Request(url=new_url, callback=self.parse, dont_filter=True)
            print('turning to next page')

        except Exception:
            print('end')

    def lin_parse(self, response):
        item = BiliItem()

        item['dianzhan'] = response.xpath('//div[@class="video-like video-toolbar-left-item"]/span/text()').get()
        item['toubi'] = response.xpath('//div[@class="video-coin video-toolbar-left-item"]/span/text()').get()
        item['shouchang'] = response.xpath('//div[@class="video-fav video-toolbar-left-item"]/span/text()').get()
        item['zhuanfa'] = response.xpath('//div[@class="video-share"]/span/text()').get()
        yield item





# bili-video-card__wrap __scale-wrap
# detail page
# https://www.bilibili.com/video/BV19a411k7fw/?spm_id_from=333.337.search-card.all.click&vd_source=01d1cc167d01b5a7cfe6be617b3d6c6e
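These detail-page selectors can be checked interactively before a full run; a quick sketch using Scrapy's shell (note that without the cookies the middleware adds, Bilibili may serve a different page):

# scrapy shell "https://www.bilibili.com/video/BV19a411k7fw/"
response.xpath('//div[@class="video-like video-toolbar-left-item"]/span/text()').get()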

https://img1.sycdn.imooc.com/climg/6561ee590903cae919201149.jpg

BOT_NAME = "bili"
SPIDER_MODULES = ["bili.spiders"]
NEWSPIDER_MODULE = "bili.spiders"


USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"

ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'

#SPIDER_MIDDLEWARES = {
#    "bili.middlewares.BiliSpiderMiddleware": 543,
#}

DOWNLOADER_MIDDLEWARES = {
   "bili.middlewares.BiliDownloaderMiddleware": 543,
}




ITEM_PIPELINES = {
   "bili.pipelines.BiliPipeline": 300,
   'scrapy_redis.pipelines.RedisPipeline': 400
}

DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
SCHEDULER_PERSIST = True

REDIS_URL = 'redis://127.0.0.1:6379'
DOWNLOAD_DELAY = 1
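Since scrapy_redis.pipelines.RedisPipeline is enabled at priority 400, scraped items are also serialized to a Redis list, by default keyed '<spider name>:items' ('app:items' here). A minimal sketch to inspect them, assuming the redis-py package is installed:

import json
import redis

r = redis.Redis(host='127.0.0.1', port=6379)
# RedisPipeline pushes each item as a JSON string onto 'app:items'
for raw in r.lrange('app:items', 0, -1):
    print(json.loads(raw))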
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class BiliItem(scrapy.Item):
    dianzhan = scrapy.Field()
    toubi = scrapy.Field()
    shouchang = scrapy.Field()
    zhuanfa = scrapy.Field()
import time

from scrapy import signals
from itemadapter import is_item, ItemAdapter
from selenium import webdriver
from selenium.webdriver.chrome.service import Service




class BiliDownloaderMiddleware:
    # Launch the browser and initialize cookies
    def __init__(self):
        self.service = Service(executable_path="D:/爬虫驱动/chromedriver.exe")
        self.driver = webdriver.Chrome(service=self.service)
        self.cookies = {}

    # If cookies are empty, fetch them from the homepage first
    def process_request(self, request, spider):
        if self.cookies == {}:
            self.driver.get('https://www.bilibili.com/')
            self.driver.maximize_window()
            time.sleep(1)
            # Collect the homepage cookies
            cookies = self.driver.get_cookies()
            # Extract the name/value pairs we want
            self.cookies = {cookie['name']: cookie['value'] for cookie in cookies}
            # print(self.cookies)

            time.sleep(1)
            self.driver.quit()
        # Cookies are now populated; attach them to every outgoing request
        request.cookies = self.cookies

        return None
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class BiliPipeline:
    def process_item(self, item, spider):
        data = {
            'dianzhan': item['dianzhan'],
            'toubi': item['toubi'],
            'shouchang': item['shouchang'],
            'zhuanfa': item['zhuanfa']
        }
        print(data)
        return item


1 Answer
好帮手慕小猿 2023-11-28 20:29:43

Hello! Your AppSpider should inherit from RedisSpider, not scrapy.Spider.

https://img1.sycdn.imooc.com/climg/6565dcff0929d8a013320479.jpg
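A minimal sketch of the fix (the class declaration shown in the screenshot, plus a hypothetical seeding snippet using the redis-py package; with RedisSpider the crawl only starts once a start URL has been pushed to redis_key):

from scrapy_redis.spiders import RedisSpider

class AppSpider(RedisSpider):
    name = "app"
    redis_key = 'app'   # start URLs are read from this Redis key

# In a separate script (or via redis-cli lpush), seed the start URL:
import redis

r = redis.Redis(host='127.0.0.1', port=6379)
r.lpush('app', 'https://search.bilibili.com/all?vt=18668812&keyword=python&from_source=webtop_search&spm_id_from=333.1007&search_source=5')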

Happy learning~

  • Asker 吴老师在线叛逃 #1

    https://img1.sycdn.imooc.com/climg/656b0f1b094de6f418201081.jpg
    https://img1.sycdn.imooc.com/climg/656b0f30094bf1cf11620481.jpg

    import scrapy
    from ..items import BiliItem
    import time
    from scrapy_redis.spiders import RedisSpider
    
    class AppSpider(RedisSpider):
        name = "app"
        redis_key = 'app'
        # allowed_domains = ["www.bilibili.com"]
        # start_urls = ["https://search.bilibili.com/all?vt=18668812&keyword=python&from_source=webtop_search&spm_id_from=333.1007&search_source=5"]
        page = 30
        num = 2
    
        def __init__(self, *args, **kwargs):
            domain = kwargs.pop('domain', '')
            self.allowed_domains = filter(None, domain.split(','))
            super(AppSpider, self).__init__(*args, **kwargs)
    
        def parse(self, response):
            time.sleep(5)
            try:
                link = response.xpath('//div[@class="video-list row"]/div')
                for lin in link:
                    lin_url = 'https:' + lin.xpath('.//div[@class="bili-video-card__wrap __scale-wrap"]/a/@href').get()
                    yield scrapy.Request(url=lin_url, callback=self.lin_parse, dont_filter=True)
    
                new_url = f"https://search.bilibili.com/all?keyword=python&from_source=webtop_search&spm_id_from=333.1007&search_source=5&page={self.num}&o={self.page}"
                print(new_url)
                self.num += 1
                self.page += 30
                yield scrapy.Request(url=new_url, callback=self.parse, dont_filter=True)
                print('turning to next page')

            except Exception:
                print('end')
    
        def lin_parse(self, response):
            item = BiliItem()
    
            item['dianzhan'] = response.xpath('//div[@class="video-like video-toolbar-left-item"]/span/text()').get()
            item['toubi'] = response.xpath('//div[@class="video-coin video-toolbar-left-item"]/span/text()').get()
            item['shouchang'] = response.xpath('//div[@class="video-fav video-toolbar-left-item"]/span/text()').get()
            item['zhuanfa'] = response.xpath('//div[@class="video-share"]/span/text()').get()
            yield item
    BOT_NAME = "bili"
    SPIDER_MODULES = ["bili.spiders"]
    NEWSPIDER_MODULE = "bili.spiders"
    
    
    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
    
    ROBOTSTXT_OBEY = False
    LOG_LEVEL = 'ERROR'
    
    #SPIDER_MIDDLEWARES = {
    #    "bili.middlewares.BiliSpiderMiddleware": 543,
    #}
    
    DOWNLOADER_MIDDLEWARES = {
       "bili.middlewares.BiliDownloaderMiddleware": 543,
    }
    
    
    
    
    ITEM_PIPELINES = {
       "bili.pipelines.BiliPipeline": 300,
       'scrapy_redis.pipelines.RedisPipeline': 400
    }
    
    DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
    SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
    SCHEDULER_PERSIST = True
    
    REDIS_URL = 'redis://127.0.0.1:6379'
    DOWNLOAD_DELAY = 1
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    # useful for handling different item types with a single interface
    from itemadapter import ItemAdapter
    
    
    class BiliPipeline:
        def process_item(self, item, spider):
            data = {
                'dianzhan': item['dianzhan'],
                'toubi': item['toubi'],
                'shouchang': item['shouchang'],
                'zhuanfa': item['zhuanfa']
            }
            print(data)
            return item
    import time
    
    from scrapy import signals
    from itemadapter import is_item, ItemAdapter
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    
    
    
    
    class BiliDownloaderMiddleware:
        # 启动浏览器,初始化cookies
        def __init__(self):
            self.sevice = Service(executable_path="D:/爬虫驱动/chromedriver.exe")
            self.driver = webdriver.Chrome(service=self.sevice)
            self.cookies = {}
    # 如果cookies为空,那么发送请求获取cookies
        def process_request(self, request, spider):
            if self.cookies == {}:
                self.driver.get('https://www.bilibili.com/')
                self.driver.maximize_window()
                time.sleep(1)
                # 获取首页cookie
                cookies = self.driver.get_cookies()
                # 提取想要的cookie值
                dick = {cookie['name']: cookie['value'] for cookie in cookies}
                self.cookies = dick
                # print(self.cookies)
    
                time.sleep(1)
                self.driver.quit()
            # 如果cookies不等于空,那就加到request的cookies里面
            # 等于空就获取一遍
            request.cookies = self.cookies
    
    
            return None
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class BiliItem(scrapy.Item):
        dianzhan = scrapy.Field()
        toubi = scrapy.Field()
        shouchang = scrapy.Field()
        zhuanfa = scrapy.Field()
        pass

    Teacher, why is it throwing an error?

    2023-12-02 19:05:51
  • Hello! The error happens because the sleep time around the request in middleware.py is too short, so the cookie values haven't been fetched yet. Increase the sleep time to 5 seconds and the cookies will be fetched successfully.

    https://img1.sycdn.imooc.com/climg/656d6ede09dd6e4314760610.jpg
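    As an alternative to guessing a fixed sleep, a hedged sketch (a common Selenium pattern, not from the course): poll until cookies actually appear instead of waiting a fixed duration.

    from selenium.webdriver.support.ui import WebDriverWait

    # Inside process_request, wait up to 10 s for at least one cookie;
    # raises selenium.common.exceptions.TimeoutException if none appear.
    WebDriverWait(self.driver, 10).until(lambda d: d.get_cookies())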

    Happy learning~


    2023-12-04 14:18:08
  • https://img1.sycdn.imooc.com/climg/65741437092cd12c13151116.jpg
    Teacher, after I modified the code it still behaves the same as before. When I run it, the cookie-fetching step seems to go wrong; the page just stays stuck there and never moves.
    https://img1.sycdn.imooc.com/climg/657414b809b48cae18220924.jpg

    2023-12-09 15:20:25