老师,我运行代码之后,返回结果显示需要登录才能获取信息,而登录时需要通过滑动验证码。这种情况下,我直接使用登录后的 cookie,然后携带 cookie 进行访问,不可以吗?
import scrapy
class AppSpider(scrapy.Spider):
    """Crawl Dangdang search-result pages and append book headings to a file.

    The spider requests every search-result page with a logged-in session
    cookie, follows the first link of each result item to its detail page,
    and appends the detail page's ``<h1>`` element to ``./cl.txt``.
    """

    name = "app"

    # Search-result URL template; ``{page}`` is the 1-based page index.
    search_url = (
        "http://search.dangdang.com/"
        "?key=%B2%E9%C0%ED%BE%C5%CA%C0&act=input&page_index={page}"
    )
    # Last search-result page to request (inclusive).
    last_page = 100

    def warn_on_generator_with_return_value_stub(spider, callable):
        """No-op replacement for Scrapy's generator-with-return-value check."""
        pass

    # Replace Scrapy's check in both modules that reference it so that it is
    # never executed for this project's callbacks.
    scrapy.utils.misc.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub
    scrapy.core.scraper.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub

    @staticmethod
    def _parse_cookie_header(raw_cookie):
        """Convert a ``name=value; name=value`` cookie string into a dict.

        Only the first ``=`` of each pair separates name from value, so values
        that themselves contain ``=`` (base64 padding, nested ``key=value``
        payloads) are kept intact. Fragments without a name or without ``=``
        are skipped.
        """
        cookies = {}
        for pair in raw_cookie.split(";"):
            key, separator, value = pair.strip().partition("=")
            if key and separator:
                cookies[key] = value
        return cookies

    def start_requests(self):
        """Yield one cookie-carrying request per search-result page.

        Pages 1 through ``last_page`` are requested here, once each, rather
        than being re-issued from every detail-page callback.
        """
        # NOTE(review): this is a captured browser session cookie with account
        # identifiers; it will presumably expire and would be better loaded
        # from settings or the environment than kept in source.
        raw_cookie = '__permanent_id=20230920231611757816761566318461699; dangdang.com=email=ZDA3NjU1NWFlOTE4NWRlN0BkZG1vYmlsZV91c2VyLmNvbQ==&nickname=&display_id=2358298867535&customerid=l8UnhPkhztLqXMfVrAcE7A==&viptype=c/5GHQl/FjE=&show_name=133****7052; ddscreen=2; __visit_id=20231012101515542981780980223343448; __out_refer=; dest_area=country_id%3D9000%26province_id%3D111%26city_id%3D0%26district_id%3D0%26town_id%3D0; pos_9_end=1697077157295; pos_0_start=1697077157394; pos_0_end=1697077157409; ad_ids=3539000%7C%232; USERNUM=KYktRh56D4dUdZlNxID1Kg==; login.dangdang.com=.ASPXAUTH=1kUydnmmYRdqIlrVed3mVv18ivPovzVc3NyaGVO/DOohvFfXaj9XVg==; MDD_username=133****7052; MDD_custId=q7m4VVqnwYI6F5NvHTgn/A%3D%3D; MDD_channelId=70000; MDD_fromPlatform=307; sessionID=pc_70e940e0219e7b710f2b4f5b11219f145a99a5bfbb4acee70d6dd3c281770f; ddoy=email=d076555ae9185de7@ddmobile_user.com&nickname=&validatedflag=0&uname=13337577052&utype=0&.ALFG=off&.ALTM=1697079067582; LOGIN_TIME=1697079069616; pos_6_start=1697079069798; pos_6_end=1697079070096; __rpm=%7Cp_11557235022...1697079073686; __trace_id=20231012105116720338014286415209942'
        cookies = self._parse_cookie_header(raw_cookie)
        for page in range(1, self.last_page + 1):
            yield scrapy.Request(
                url=self.search_url.format(page=page),
                callback=self.parse,
                cookies=cookies,
            )

    def parse(self, response):
        """Yield a detail-page request for each item of a search-result page."""
        print('------------------------------')
        for item in response.xpath('.//ul[@class="bigimg"]/li'):
            href = item.xpath('.//a/@href').get()
            if not href:
                # Result item without a link: nothing to follow.
                continue
            # urljoin resolves protocol-relative ("//host/path") and relative
            # hrefs against the page URL and leaves absolute URLs untouched.
            yield scrapy.Request(url=response.urljoin(href), callback=self.parse_init)

    def parse_init(self, response):
        """Append the detail page's ``<h1>`` element to ``./cl.txt``.

        Does nothing when the page has no matching heading.
        """
        heading = response.xpath('//*[@id="product_info"]/div[1]/h1')
        if not heading:
            return
        # extract() serializes the whole <h1> element, not only its text.
        title = heading[0].extract().strip()
        print(title)
        with open('./cl.txt', 'a', encoding='utf-8') as f:
            # Terminate each record so consecutive titles do not run together.
            f.write(title + "\n")

12
收起
恭喜解决一个难题,获得1积分~
来为老师/同学的回答评分吧
0 星