老师,我运行代码之后,返回结果显示需要登录才能获取信息,而登录时需要通过滑动验证码。这种情况下,我直接使用登录后的 cookie,然后携带该 cookie 进行访问,不可以吗?
import scrapy
# Imported explicitly: attribute access such as ``scrapy.utils.misc`` only
# works when the submodule has already been imported by something else.
import scrapy.core.scraper
import scrapy.utils.misc


def warn_on_generator_with_return_value_stub(spider, callable):
    """No-op stand-in for Scrapy's ``warn_on_generator_with_return_value``."""


# Replace Scrapy's generator/return-value check with the no-op above in both
# modules that hold a reference to it.
scrapy.utils.misc.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub
scrapy.core.scraper.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub


class AppSpider(scrapy.Spider):
    """Crawl dangdang.com search results and record each book's title.

    The spider walks the search listing page by page (up to ``MAX_PAGE``),
    follows every product link found on a listing page, and appends the
    product title to ``OUTPUT_PATH``. Requests carry the cookies of an
    already logged-in browser session so that no interactive login (slider
    captcha) is needed.
    """

    name = "app"

    # Search listing URL; ``{page}`` is the 1-based page index.
    SEARCH_URL = 'http://search.dangdang.com/?key=%B2%E9%C0%ED%BE%C5%CA%C0&act=input&page_index={page}'
    # Last listing page that will ever be requested.
    MAX_PAGE = 100
    # File that collected titles are appended to, one per line.
    OUTPUT_PATH = './cl.txt'

    # Raw ``Cookie`` header copied from a logged-in browser session.
    # NOTE(review): this is a live session credential and will expire; it
    # should be loaded from settings or the environment rather than being
    # committed with the source.
    COOKIE_HEADER = (
        '__permanent_id=20230920231611757816761566318461699; '
        'dangdang.com=email=ZDA3NjU1NWFlOTE4NWRlN0BkZG1vYmlsZV91c2VyLmNvbQ==&nickname=&display_id=2358298867535&customerid=l8UnhPkhztLqXMfVrAcE7A==&viptype=c/5GHQl/FjE=&show_name=133****7052; '
        'ddscreen=2; '
        '__visit_id=20231012101515542981780980223343448; '
        '__out_refer=; '
        'dest_area=country_id%3D9000%26province_id%3D111%26city_id%3D0%26district_id%3D0%26town_id%3D0; '
        'pos_9_end=1697077157295; '
        'pos_0_start=1697077157394; '
        'pos_0_end=1697077157409; '
        'ad_ids=3539000%7C%232; '
        'USERNUM=KYktRh56D4dUdZlNxID1Kg==; '
        'login.dangdang.com=.ASPXAUTH=1kUydnmmYRdqIlrVed3mVv18ivPovzVc3NyaGVO/DOohvFfXaj9XVg==; '
        'MDD_username=133****7052; '
        'MDD_custId=q7m4VVqnwYI6F5NvHTgn/A%3D%3D; '
        'MDD_channelId=70000; '
        'MDD_fromPlatform=307; '
        'sessionID=pc_70e940e0219e7b710f2b4f5b11219f145a99a5bfbb4acee70d6dd3c281770f; '
        'ddoy=email=d076555ae9185de7@ddmobile_user.com&nickname=&validatedflag=0&uname=13337577052&utype=0&.ALFG=off&.ALTM=1697079067582; '
        'LOGIN_TIME=1697079069616; '
        'pos_6_start=1697079069798; '
        'pos_6_end=1697079070096; '
        '__rpm=%7Cp_11557235022...1697079073686; '
        '__trace_id=20231012105116720338014286415209942'
    )

    @staticmethod
    def _parse_cookie_header(header):
        """Turn a raw ``Cookie`` header string into a ``{name: value}`` dict.

        Each ``name=value`` pair is split on the FIRST ``=`` only, because
        values may themselves contain ``=`` (base64 padding, nested
        ``key=value`` payloads). Pairs with an empty name are skipped.
        """
        cookies = {}
        for pair in header.split(';'):
            key, _, value = pair.strip().partition('=')
            if key:
                cookies[key] = value
        return cookies

    def _login_cookies(self):
        """Return the logged-in session cookies as a dict for ``Request``."""
        return self._parse_cookie_header(self.COOKIE_HEADER)

    def _search_request(self, page):
        """Build the request for listing page ``page`` (1-based)."""
        return scrapy.Request(
            url=self.SEARCH_URL.format(page=page),
            callback=self.parse,
            cookies=self._login_cookies(),
            # Remembered so that ``parse`` knows which page comes next.
            meta={'page': page},
        )

    def start_requests(self):
        """Start the crawl at the first listing page, carrying the cookies."""
        yield self._search_request(1)

    def parse(self, response):
        """Handle one listing page.

        Yields a request for every product detail link on the page, then a
        request for the following listing page while the current page still
        contains products and ``MAX_PAGE`` has not been reached.
        """
        page = response.meta.get('page', 1)
        entries = response.xpath('.//ul[@class="bigimg"]/li')
        self.logger.info('listing page %s: %d entries', page, len(entries))

        for entry in entries:
            href = entry.xpath('.//a/@href').get()
            if not href:
                # Entry without a link: nothing to follow.
                continue
            # NOTE(review): presumably cookies given as a dict are scoped to
            # the host of the request they were attached to, so they are
            # passed again here for the product host - confirm against the
            # Scrapy cookies middleware documentation.
            yield scrapy.Request(
                # urljoin resolves protocol-relative ("//host/path"),
                # relative and already-absolute hrefs alike.
                url=response.urljoin(href),
                callback=self.parse_init,
                cookies=self._login_cookies(),
            )

        # Paginate from the listing page itself (once per page) instead of
        # re-issuing every page from each detail page.
        if entries and page < self.MAX_PAGE:
            yield self._search_request(page + 1)

    def parse_init(self, response):
        """Handle one product detail page: append its title to the output file.

        Nothing is written when the page has no title element.
        """
        # normalize-space() yields the heading's text content (not its HTML
        # markup) with surrounding and repeated whitespace collapsed.
        title = response.xpath('normalize-space(//*[@id="product_info"]/div[1]/h1)').get()
        if not title:
            return
        self.logger.info('title: %s', title)
        with open(self.OUTPUT_PATH, 'a', encoding='utf-8') as f:
            # One title per line so that records do not run together.
            f.write(title + '\n')
12
收起
恭喜解决一个难题,获得1积分~
来为老师/同学的回答评分吧
0 星