No errors are raised and the scraping itself runs fine, but nothing gets written into the database. What could be causing this?
import requests
from multiprocessing import Queue
from lxml import etree
import threading
from study_crawl_51job.handle_mongo import insert_data


# Thread class that handles the page numbers
class Crawl_page(threading.Thread):
    # override the parent __init__
    def __init__(self, thread_name, page_queue, data_queue):
        super(Crawl_page, self).__init__()
        # thread name
        self.thread_name = thread_name
        # queue of page numbers
        self.page_queue = page_queue
        # queue of page text
        self.data_queue = data_queue
        # default request headers
        self.header = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Cookie": "guid=ff0bfbbf2af5c55ff3b12bdf30794cf0; slife=lowbrowser%3Dnot%26%7C%26; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; 51job=cenglish%3D0%26%7C%26; search=jobarea%7E%60010000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60010000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA05%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch3%7E%60190200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch4%7E%60190200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%D6%D0%D0%BE%B9%A9%D3%A6%C1%B4%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21",
            "Host": "search.51job.com",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        }

    def run(self):
        print("当前启动的处理页码的任务为:{0}".format(self.thread_name))
        # while True
        while not page_flag:
            # put()/get() on a Queue take a block argument; it defaults to True,
            # so it is set to False here. When the queue is empty/full,
            # an Empty/Full exception is raised.
            try:
                # get a page number; raises an exception when the queue is empty
                page = self.page_queue.get(block=False)
                page_url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,' + str(page) + '.html'
                print("当前构造的url为{0}".format(page_url))
                # request the constructed url
                # proxy settings
                proxy = {
                    "http": "http://TZXE1670577616242459:2oY99U0ZZTrX@dyn.horocn.com:50000",
                    "https": "http://TZXE1670577616242459:2oY99U0ZZTrX@dyn.horocn.com:50000",
                }
                # send the request through the proxy
                response = requests.get(url=page_url, headers=self.header, proxies=proxy)
                # set the page encoding
                response.encoding = 'gb2312'
                # put the returned page text into the data queue
                self.data_queue.put(response.text)
            except:
                pass


# Thread class that parses the page text
class Crawl_html(threading.Thread):
    # the text fetched by the page threads is read back out of data_queue
    def __init__(self, thread_name, data_queue, lock):
        super(Crawl_html, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.lock = lock

    def run(self):
        print("当前启动处理文本任务的线程为{}".format(self.thread_name))
        while not data_flag:
            try:
                # get the page text
                text = self.data_queue.get(block=False)
                # parse it
                result = self.parse(text)
                with self.lock:
                    insert_data.insert_db(result)
            except:
                pass

    # parse one result page
    def parse(self, text):
        # build the HTML tree
        html_51job = etree.HTML(text)
        all_div = html_51job.xpath("//div[@id='resultList']//div[@class='el']")
        info_list = []
        for item in all_div:
            info = {}
            info['job_name'] = item.xpath("./p/span/a/@title")[0]
            info['company_name'] = item.xpath(".//span[@class='t2']/a/@title")[0]
            info['company_address'] = item.xpath(".//span[@class='t3']/text()")[0]
            info['salary'] = item.xpath(".//span[@class='t4']/text()")[0]
            info['job_upgrade'] = item.xpath(".//span[@class='t5']/text()")[0]
            info_list.append(info)
        return info_list


# two global flags used to stop the threads
page_flag = False
data_flag = False


def main():
    # two queues: one for page numbers, one for page text
    page_queue = Queue()
    data_queue = Queue()
    # a lock shared by the parsing threads
    lock = threading.Lock()
    # put the page numbers into the page queue
    for page in range(1, 685):
        page_queue.put(page, block=True, timeout=0)
    # page_queue.qsize() returns the current length of the queue

    # list of thread names; three page-fetching threads are started
    crawl_page_list = ["页码处理线程1号", "页码处理线程2号", "页码处理线程3号"]
    page_thread_list = []
    for thread_name_page in crawl_page_list:
        thread_page = Crawl_page(thread_name_page, page_queue, data_queue)
        # start the thread
        thread_page.start()
        page_thread_list.append(thread_page)

    # three threads that parse the page text
    parseList = ["文本处理线程1号", "文本处理线程2号", "文本处理线程3号"]
    parse_thread_list = []
    for thread_name_parse in parseList:
        thread_parse = Crawl_html(thread_name_parse, data_queue, lock)
        thread_parse.start()
        parse_thread_list.append(thread_parse)

    # thread shutdown: the busy-wait ends once page_queue is empty
    global page_flag
    while not page_queue.empty():
        pass
    page_flag = True
    # join the page-fetching threads
    for thread_page_join in page_thread_list:
        thread_page_join.join()
        print(thread_page_join.thread_name, "处理结束")

    global data_flag
    while not data_queue.empty():
        pass
    data_flag = True
    for thread_parse_join in parse_thread_list:
        thread_parse_join.join()
        print(thread_parse_join.thread_name, "处理结束")


if __name__ == '__main__':
    main()
# handle_mongo.py
import pymongo


class Mongo_client(object):
    def __init__(self):
        myclient = pymongo.MongoClient("mongodb://192.168.105.140:27017")
        mydb = myclient['db_51job']
        self.mycollection = mydb['collection_51job']

    def insert_db(self, item):
        self.mycollection.insert_many(item)


insert_data = Mongo_client()
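One thing worth noting about the code above: both run() loops wrap everything in a bare except/pass, so any exception raised by parse() or insert_db() (an XPath that matches nothing, a MongoDB connection failure, or insert_many() being handed an empty list) is swallowed silently, which would look exactly like "no error, but no data". Below is a minimal standalone sketch for checking the insert path on its own, outside the threads. It assumes the same study_crawl_51job.handle_mongo module and that the MongoDB instance at 192.168.105.140:27017 is reachable from the machine running it; the sample record and its field values are made up purely for the test.

import pprint

# reuse the same client object the crawler uses
from study_crawl_51job.handle_mongo import insert_data

# a fake record with the same fields parse() produces (values are placeholders)
sample = [{
    "job_name": "test",
    "company_name": "test",
    "company_address": "test",
    "salary": "test",
    "job_upgrade": "test",
}]

# count documents before and after the insert to see whether the write lands
print("before:", insert_data.mycollection.count_documents({}))
insert_data.insert_db(sample)
print("after:", insert_data.mycollection.count_documents({}))
pprint.pprint(insert_data.mycollection.find_one({"job_name": "test"}))

If this script raises or the count does not change, the problem is on the MongoDB side; if it works, the exception being hidden by the except/pass in the worker threads is the next thing to look at, for example by temporarily printing the traceback there instead of passing.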