No errors are reported and the pages are crawled normally, but nothing gets written into the database. What could be the cause? The code is below:
import requests
from multiprocessing import Queue
from lxml import etree
import threading
from study_crawl_51job.handle_mongo import insert_data


# Thread class that works through the page numbers
class Crawl_page(threading.Thread):
    # Override the parent constructor
    def __init__(self, thread_name, page_queue, data_queue):
        super(Crawl_page, self).__init__()
        # Thread name
        self.thread_name = thread_name
        # Queue of page numbers
        self.page_queue = page_queue
        # Queue of page text
        self.data_queue = data_queue
        # Default request headers
        self.header = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Cookie": "guid=ff0bfbbf2af5c55ff3b12bdf30794cf0; slife=lowbrowser%3Dnot%26%7C%26; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; 51job=cenglish%3D0%26%7C%26; search=jobarea%7E%60010000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60010000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA05%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch3%7E%60190200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch4%7E%60190200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%D6%D0%D0%BE%B9%A9%D3%A6%C1%B4%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21",
            "Host": "search.51job.com",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        }

    def run(self):
        print("Page-number thread started: {0}".format(self.thread_name))
        # while True
        while not page_flag:
            # When calling put/get on the Queue, the block argument (True by default)
            # has to be set to False; an Empty/Full exception is raised when the
            # queue has no data (or is full)
            try:
                # get() a page number; raises when the queue is empty
                page = self.page_queue.get(block=False)
                page_url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,' + str(page) + '.html'
                print("Constructed URL: {0}".format(page_url))
                # Request the constructed URL through a proxy
                proxy = {
                    "http": "http://TZXE1670577616242459:2oY99U0ZZTrX@dyn.horocn.com:50000",
                    "https": "http://TZXE1670577616242459:2oY99U0ZZTrX@dyn.horocn.com:50000",
                }
                response = requests.get(url=page_url, headers=self.header, proxies=proxy)
                # Set the page encoding
                response.encoding = 'gb2312'
                # Put the returned page text into the data queue
                self.data_queue.put(response.text)
            except:
                pass


# Thread class that parses the page text
class Crawl_html(threading.Thread):
    # The page text fetched by the page threads is taken out of data_queue
    def __init__(self, thread_name, data_queue, lock):
        super(Crawl_html, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.lock = lock

    def run(self):
        print("Text-processing thread started: {}".format(self.thread_name))
        while not data_flag:
            try:
                # Get the page text
                text = self.data_queue.get(block=False)
                # Parse it and write the result to the database
                result = self.parse(text)
                with self.lock:
                    insert_data.insert_db(result)
            except:
                pass

    # Parse one result page
    def parse(self, text):
        # Build the HTML tree
        html_51job = etree.HTML(text)
        all_div = html_51job.xpath("//div[@id='resultList']//div[@class='el']")
        info_list = []
        for item in all_div:
            info = {}
            info['job_name'] = item.xpath("./p/span/a/@title")[0]
            info['company_name'] = item.xpath(".//span[@class='t2']/a/@title")[0]
            info['company_address'] = item.xpath(".//span[@class='t3']/text()")[0]
            info['salary'] = item.xpath(".//span[@class='t4']/text()")[0]
            info['job_upgrade'] = item.xpath(".//span[@class='t5']/text()")[0]
            info_list.append(info)
        return info_list


# Two global exit flags
page_flag = False
data_flag = False


def main():
    # Two queues: one for the page numbers, one for the page text
    page_queue = Queue()
    data_queue = Queue()
    # A lock for the database writes
    lock = threading.Lock()
    # Put the page numbers into the page queue
    for page in range(1, 685):
        page_queue.put(page, block=True, timeout=0)
    # page_queue.qsize() returns the current length of the queue
    # Names for the three page-number threads
    crawl_page_list = ["page thread 1", "page thread 2", "page thread 3"]
    page_thread_list = []
    for thread_name_page in crawl_page_list:
        thread_page = Crawl_page(thread_name_page, page_queue, data_queue)
        # Start the thread
        thread_page.start()
        page_thread_list.append(thread_page)
    # Three threads for processing the page text
    parseList = ["text thread 1", "text thread 2", "text thread 3"]
    parse_thread_list = []
    for thread_name_parse in parseList:
        thread_parse = Crawl_html(thread_name_parse, data_queue, lock)
        thread_parse.start()
        parse_thread_list.append(thread_parse)
    # Thread exit mechanism
    global page_flag
    # Busy-wait until page_queue is empty
    while not page_queue.empty():
        pass
    page_flag = True
    # Wait for the page-number threads to finish
    for thread_page_join in page_thread_list:
        thread_page_join.join()
        print(thread_page_join.thread_name, "finished")
    global data_flag
    while not data_queue.empty():
        pass
    data_flag = True
    for thread_parse_join in parse_thread_list:
        thread_parse_join.join()
        print(thread_parse_join.thread_name, "finished")


if __name__ == '__main__':
    main()
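One detail that makes this hard to debug: both run() loops use a bare except: pass, so any exception raised by parse() or by insert_data.insert_db() is silently discarded (for example, pymongo rejects an empty list passed to insert_many(), which is what happens if the XPath matches nothing on a page). That produces exactly the "no error, no data" symptom. Below is a minimal debugging sketch of the text-processing loop with the errors surfaced instead of swallowed; only run() is shown, __init__() and parse() stay as in the code above, and queue.Empty / traceback come from the standard library:

import queue      # standard library; only needed here for queue.Empty
import traceback


class Crawl_html(threading.Thread):
    # __init__() and parse() unchanged from the code above

    def run(self):
        print("Text-processing thread started: {}".format(self.thread_name))
        while not data_flag:
            try:
                text = self.data_queue.get(block=False)
            except queue.Empty:
                continue                      # queue temporarily empty, try again
            try:
                result = self.parse(text)
                # quick visibility: is anything actually being parsed?
                print("{} parsed {} records".format(self.thread_name, len(result)))
                with self.lock:
                    insert_data.insert_db(result)
            except Exception:
                # print the real error instead of hiding it behind a bare except
                traceback.print_exc()

This is just a sketch for diagnosis, not a claim about where the bug is; the same pattern could be applied to Crawl_page.run() as well.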
handle_mongo.py:

import pymongo


class Mongo_client(object):
    def __init__(self):
        myclient = pymongo.MongoClient("mongodb://192.168.105.140:27017")
        mydb = myclient['db_51job']
        self.mycollection = mydb['collection_51job']

    def insert_db(self, item):
        self.mycollection.insert_many(item)


insert_data = Mongo_client()
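To rule out the MongoDB side, it may also help to test the insert path on its own, outside the threads. A minimal sketch, reusing the handle_mongo module above (the address, database, and collection names are the ones from that file; the test record itself is made up):

from study_crawl_51job.handle_mongo import insert_data

# insert_db() expects a list of documents, same as the crawler passes in
insert_data.insert_db([{"job_name": "test", "company_name": "test", "salary": "0"}])
# then check in the mongo shell:
#   use db_51job
#   db.collection_51job.find().count()

If this single insert succeeds while the crawler still writes nothing, the problem is upstream of insert_db() (an empty parse result or a swallowed exception), not in MongoDB itself.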