The request headers set in middlewares cannot be printed in parse
#middlewares.py
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import random
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class TubatuScrapyProjectSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class TubatuScrapyProjectDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# Custom downloader middleware: attach a random User-Agent to every outgoing request
class my_useragent(object):
    def process_request(self, request, spider):
        # User-agent strings found online
user_agent_list=[
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
        # Pick one user-agent string at random
        agent = random.choice(user_agent_list)
        # Set the User-Agent request header on the outgoing request
        request.headers['User-Agent'] = agent
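For the my_useragent middleware above to take effect, it also has to be enabled in the project settings. settings.py is not shown in the post, so this is only a minimal sketch: the module path is inferred from the spider's import of tubatu_scrapy_project, and the priority 543 is an arbitrary choice.
#settings.py (sketch)
# Register the custom downloader middleware so its process_request() runs for every request
DOWNLOADER_MIDDLEWARES = {
    'tubatu_scrapy_project.middlewares.my_useragent': 543,
}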
#tubatu.py
import scrapy
from tubatu_scrapy_project.items import TubatuScrapyProjectItem
class TubatuSpider(scrapy.Spider):
    # Spider name; must be unique within the project
name = 'tubatu'
    # Domains the spider is allowed to crawl
allowed_domains = ['xiaoguotu.to8to.com']
    # Initial URLs requested when the spider starts
start_urls = ['https://xiaoguotu.to8to.com/tuce_sort1?page=1']
    # Default parse callback
def parse(self, response):
print(response.request.hearders)
        # response supports xpath() directly
        # response is an HtmlResponse object
pic_item_list=response.xpath("//div[@class='item' and @oldaid]")
for item in pic_item_list:
info={}
            # extract_first() returns the first matching value
# content_name=item.xpath(".//div/a/text()").extract()
            # Project name
info['content_name']=item.xpath(".//div/a/text()").extract_first()
            # Project URL
info['content_url']="https:"+item.xpath(".//div/a/@href").extract_first()
            # yield the request so Scrapy schedules it asynchronously
            # use scrapy.Request to build the request
            # pass only the callback's name; do not call it
yield scrapy.Request(url=info['content_url'],callback=self.handle_pic_parse,meta=info)
break
        # Pagination logic
# if response.xpath("//a[@class='next']"):
# now_page=int(response.xpath("//strong/text()").extract_first())
# next_page_url='https://xiaoguotu.to8to.com/tuce_sort1?page={}'.format(now_page+1)
# yield scrapy.Request(url=next_page_url,callback=self.parse)
def handle_pic_parse(self,response):
tubatu_info = TubatuScrapyProjectItem()
tubatu_info['content_name']=response.request.meta['content_name']
tubatu_info['content_url']=response.request.meta['content_url']
pic_url_list=response.xpath("//ul[@class='img_list_container swiper-wrapper']/li[@data-xgtid]//img/@src").extract()
tubatu_info['pic_url_list']=pic_url_list
        # yield the item to the pipeline
        yield tubatu_info
2 Answers
Cherub1Zhuang
2021-12-30 00:43:15
Found the cause: "headers" was misspelled.
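For reference, a minimal sketch of the corrected callback in tubatu.py; only the attribute name changes. Scrapy stores header names and values as bytes, so the printed dict will show byte strings (the sample output below is illustrative only).
    def parse(self, response):
        # The attribute is `headers`, not `hearders`; this prints the headers actually
        # sent with this request, including the User-Agent set by my_useragent,
        # e.g. {b'User-Agent': [b'Mozilla/5.0 ...'], ...}
        print(response.request.headers)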
Cherub1Zhuang
2021-12-30 00:38:23
The error reported is: 'Request' object has no attribute 'hearders'.