Scrapy only fetches the last image of each case
```python
import scrapy
import re
from venv.moc.parsetest.scrapy001.scrapy_project.tubatu.items import TubatuItem


class TuvatuSpider(scrapy.Spider):
    name = 'tubatu'
    allowed_domains = ['to8to.com']
    start_urls = ['https://xiaoguotu.to8to.com/case/type3/p1.html']

    def parse(self, response):
        print(response.request.headers)
        pic_item_list = response.xpath('//div[@class = "item"]')
        for item in pic_item_list:
            info = {}
            info["content_name"] = item.xpath('.//div//a[@class="title"]/text()').extract_first().replace('\n', '')
            info["content_url"] = item.xpath('.//div//a[@class="title"]/@href').extract_first()
            yield scrapy.Request(info["content_url"], callback=self.hanle_pic_parse, meta=info)
            break  # stops after the first case, presumably left in for debugging

        # if response.xpath("//a[@id='nextpageid']"):
        #     now_page = int(response.xpath("//div[@class='pages']/strong/text()").extract_first())
        #     next_page_url = 'https://xiaoguotu.to8to.com/case/type3/p%d.html' % (now_page + 1)
        #     # print(now_page)
        #     yield scrapy.Request(url=next_page_url, callback=self.parse)

        # for i in range(2, 3):
        #     next_url = 'https://xiaoguotu.to8to.com/case/type3/p%d.html' % (i)
        #     print("Crawling page {}".format(i))
        #     yield scrapy.Request(url=next_url, callback=self.parse)

    def hanle_pic_parse(self, response):
        tubatu_info = TubatuItem()
        tubatu_info["content_name"] = response.request.meta["content_name"]
        tubatu_info["content_url"] = response.request.meta["content_url"]
        # tubatu_info['case_designer'] = response.xpath('//div[@class="small-avatar"]//div/img/@alt').extract_first()
        tubatu_info['house_type'] = response.xpath('//div[@class="info-wrapper"]//span[1]/a/text()').extract_first()
        tubatu_info['house_style'] = response.xpath('//div[@class="info-wrapper"]//span[2]/a/text()').extract_first()
        tubatu_info['area'] = response.xpath('//div[@class="info-wrapper"]//span[3]/a/text()').extract_first()
        pic_url_list = response.xpath('//div[@id="js-viewer"]//img//@data-original')
        for pic_url in pic_url_list:
            pic_id_search = re.compile(r"case/(.*?)\?x-oss-process")
            id = pic_id_search.search(pic_url.extract()).group(1)
            tubatu_info["pic_name"] = id
            tubatu_info["image_urls"] = [pic_url.extract()]
            yield tubatu_info
```
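For reference, the `items.py` imported above is not included in the question. Judging from the fields the spider assigns, it presumably looks something like this hypothetical sketch (`image_urls` and `images` are the conventional input/output fields of Scrapy's ImagesPipeline):

```python
import scrapy

# Hypothetical reconstruction of items.py -- the original file is not shown,
# so the field list below is inferred from the assignments in the spider.
class TubatuItem(scrapy.Item):
    content_name = scrapy.Field()  # case title from the listing page
    content_url = scrapy.Field()   # detail-page URL
    house_type = scrapy.Field()    # layout, first info-wrapper span
    house_style = scrapy.Field()   # decoration style, second span
    area = scrapy.Field()          # floor area, third span
    pic_name = scrapy.Field()      # image id parsed out of the image URL
    image_urls = scrapy.Field()    # input field read by ImagesPipeline
    images = scrapy.Field()        # output field written by ImagesPipeline
```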
Related code:
```python
import pymongo
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem


class TubatuPipeline(object):
    def __init__(self):
        # Connect to MongoDB and keep a handle to the target collection.
        myclient = pymongo.MongoClient(host='linzpao.zjjy.vip', port=27017,
                                       username='linzpao-admin', password='linzpao123')
        mydb = myclient['linzpao']
        self.mycollection = mydb['tubutu']

    def process_item(self, item, spider):
        # self.mycollection.delete_many({})
        data = dict(item)
        self.mycollection.insert_one(data)
        return item


class tubatuImagePipeline(ImagesPipeline):
    # def get_media_requests(self, item, info):
    #     pass

    def item_completed(self, results, item, info):
        # Drop items whose image download failed.
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('item contains no images')
        return item

    def file_path(self, request, response=None, info=None, *, item):
        # Name the saved file after the id parsed in the spider.
        print(item['pic_name'])
        file_name = item['pic_name'].split('/')[-1]
        return file_name
```
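For completeness, both pipelines have to be registered in settings.py, and the image pipeline needs a storage directory. A minimal sketch, assuming the project package is named `tubatu` (the real module path and priority numbers may differ):

```python
# settings.py -- minimal sketch; the 'tubatu.pipelines' module path is an assumption.
ITEM_PIPELINES = {
    'tubatu.pipelines.tubatuImagePipeline': 1,  # download images first
    'tubatu.pipelines.TubatuPipeline': 2,       # then store metadata in MongoDB
}
IMAGES_STORE = './images'  # directory where ImagesPipeline writes the files
```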
Problem description:
Could the instructor please take a look at why each case only yields one image?
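A likely cause, judging from `hanle_pic_parse` above: a single `TubatuItem` object is created before the loop and then mutated and re-yielded on every iteration. Scrapy does not copy an item when it is yielded, and the pipelines handle items asynchronously, so every yielded reference ends up seeing the state of the last iteration, and only the final image's `pic_name`/`image_urls` get processed. A sketch of one possible fix, creating a fresh item for each image (it assumes `TubatuItem` declares the fields used in the code above):

```python
def hanle_pic_parse(self, response):
    # Metadata shared by every image of this case.
    meta = response.request.meta
    base = {
        'content_name': meta['content_name'],
        'content_url': meta['content_url'],
        'house_type': response.xpath('//div[@class="info-wrapper"]//span[1]/a/text()').extract_first(),
        'house_style': response.xpath('//div[@class="info-wrapper"]//span[2]/a/text()').extract_first(),
        'area': response.xpath('//div[@class="info-wrapper"]//span[3]/a/text()').extract_first(),
    }
    pic_id_search = re.compile(r"case/(.*?)\?x-oss-process")
    for pic_url in response.xpath('//div[@id="js-viewer"]//img//@data-original').extract():
        tubatu_info = TubatuItem(base)  # a new item object per image
        tubatu_info['pic_name'] = pic_id_search.search(pic_url).group(1)
        tubatu_info['image_urls'] = [pic_url]
        yield tubatu_info
```

Alternatively, one item per case with every URL collected into `image_urls` would also work, but then `file_path` would need to derive the file name from `request.url` rather than from `item['pic_name']`, which only holds a single id.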