No errors are raised and the scraping itself works fine, but nothing ends up in the database. What is going on? The crawler script and study_crawl_51job/handle_mongo.py are both pasted below.
import requests
from multiprocessing import Queue
from lxml import etree
import threading
from study_crawl_51job.handle_mongo import insert_data
# worker thread that turns page numbers into listing-page requests
class Crawl_page(threading.Thread):
    # override the parent class's __init__
    def __init__(self, thread_name, page_queue, data_queue):
        super(Crawl_page, self).__init__()
        # thread name
        self.thread_name = thread_name
        # queue of page numbers to crawl
        self.page_queue = page_queue
        # queue that receives the fetched page HTML
        self.data_queue = data_queue
        # default request headers
        self.header = {
"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Encoding":"gzip, deflate, br",
"Accept-Language":"zh-CN,zh;q=0.9",
"Connection":"keep-alive",
"Cookie":"guid=ff0bfbbf2af5c55ff3b12bdf30794cf0; slife=lowbrowser%3Dnot%26%7C%26; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; 51job=cenglish%3D0%26%7C%26; search=jobarea%7E%60010000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60010000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA05%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch3%7E%60190200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch4%7E%60190200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%D6%D0%D0%BE%B9%A9%D3%A6%C1%B4%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21",
"Host":"search.51job.com",
"Sec-Fetch-Dest":"document",
"Sec-Fetch-Mode":"navigate",
"Sec-Fetch-Site":"none",
"Upgrade-Insecure-Requests":"1",
"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
}
    def run(self):
        print("Page-number worker started: {0}".format(self.thread_name))
        # keep looping until the main thread sets page_flag
        while not page_flag:
            # put()/get() take a block argument that defaults to True;
            # with block=False the call raises queue.Empty / queue.Full
            # immediately instead of waiting
            try:
                # pull one page number off the queue; raises Empty when none are left
                page = self.page_queue.get(block=False)
                page_url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,'+str(page)+'.html'
                print("Constructed URL: {0}".format(page_url))
                # request the constructed URL
                # set up the proxy
proxy = {
"http": "http://TZXE1670577616242459:2oY99U0ZZTrX@dyn.horocn.com:50000",
"https": "http://TZXE1670577616242459:2oY99U0ZZTrX@dyn.horocn.com:50000",
}
                # send the request through the proxy with requests.get
                response = requests.get(url=page_url, headers=self.header, proxies=proxy)
                # set the page encoding
                response.encoding = 'gb2312'
                # put the fetched HTML text onto the data queue
                self.data_queue.put(response.text)
            except:
                # note: this bare except silently swallows every error here
                # (request failures, queue.Empty, ...)
                pass
# worker thread that parses the page HTML and writes the results to MongoDB
class Crawl_html(threading.Thread):
    # the HTML text produced by the page workers is consumed from data_queue
    def __init__(self, thread_name, data_queue, lock):
        super(Crawl_html, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.lock = lock

    def run(self):
        print("Text-parsing worker started: {}".format(self.thread_name))
        while not data_flag:
            try:
                # take one page of HTML off the data queue
                text = self.data_queue.get(block=False)
                # parse it into a list of job dicts
                result = self.parse(text)
                with self.lock:
                    insert_data.insert_db(result)
            except:
                # note: this bare except also hides any error raised by parse()
                # or insert_db(), so a failed insert is never reported
                pass
    # parse one listing page into a list of job dicts
    def parse(self, text):
        # build the lxml HTML tree
        html_51job = etree.HTML(text)
        all_div = html_51job.xpath("//div[@id='resultList']//div[@class='el']")
info_list = []
for item in all_div:
info = {}
info['job_name'] = item.xpath("./p/span/a/@title")[0]
info['company_name'] = item.xpath(".//span[@class='t2']/a/@title")[0]
info['company_address'] = item.xpath(".//span[@class='t3']/text()")[0]
info['salary'] = item.xpath(".//span[@class='t4']/text()")[0]
info['job_upgrade'] = item.xpath(".//span[@class='t5']/text()")[0]
info_list.append(info)
return info_list
# two global flags that tell the worker threads when to exit
page_flag = False
data_flag = False
def main():
    # two queues: one for page numbers, one for fetched page HTML
    page_queue = Queue()
    data_queue = Queue()
    # a lock shared by the parsing threads around the database insert
    lock = threading.Lock()
    # load every page number into the page queue
    for page in range(1, 685):
        page_queue.put(page, block=True, timeout=0)
    # page_queue.qsize() returns the current queue length and could be printed as a progress hint
    # three page-crawling threads, identified by name
    crawl_page_list = ["page worker 1", "page worker 2", "page worker 3"]
page_thread_list = []
for thread_name_page in crawl_page_list:
thread_page = Crawl_page(thread_name_page, page_queue, data_queue)
        # start the thread
        thread_page.start()
        page_thread_list.append(thread_page)
    # three threads that parse the page text
    parseList = ["parse worker 1", "parse worker 2", "parse worker 3"]
parse_thread_list = []
for thread_name_parse in parseList:
thread_parse = Crawl_html(thread_name_parse, data_queue, lock)
thread_parse.start()
parse_thread_list.append(thread_parse)
    # shutdown logic for the worker threads
    global page_flag
    # busy-wait here until page_queue is empty, then flip the flag
    while not page_queue.empty():
        pass
    page_flag = True
    # wait for the page-crawling threads to finish
    for thread_page_join in page_thread_list:
        thread_page_join.join()
        print(thread_page_join.thread_name, "finished")
    global data_flag
    while not data_queue.empty():
        pass
    data_flag = True
    for thread_parse_join in parse_thread_list:
        thread_parse_join.join()
        print(thread_parse_join.thread_name, "finished")
if __name__ == '__main__':
    main()


# study_crawl_51job/handle_mongo.py
import pymongo
class Mongo_client(object):
    def __init__(self):
        # connect to MongoDB and pick the database and collection
        myclient = pymongo.MongoClient("mongodb://192.168.105.140:27017")
        mydb = myclient['db_51job']
        self.mycollection = mydb['collection_51job']

    def insert_db(self, item):
        # insert_many() requires a non-empty list of documents;
        # passing it an empty list raises an exception
        self.mycollection.insert_many(item)


insert_data = Mongo_client()
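Nothing is ever reported because both worker classes wrap their whole loop body in a bare except: pass, so any failure inside parse() or insert_db() disappears silently. Below is a minimal debugging sketch, not a fix of the original script, showing how the run() method of Crawl_html could be rewritten so that errors are printed and pages where the XPath matched nothing are skipped; it reuses the data_queue, lock, data_flag and insert_data names from the script above and only adds the queue import for the Empty exception.

import queue  # only for queue.Empty; everything else comes from the script above

class Crawl_html(threading.Thread):
    # __init__ is unchanged from the version above

    # same loop as before, but errors are printed and empty pages are skipped
    def run(self):
        print("Text-parsing worker started: {}".format(self.thread_name))
        while not data_flag:
            try:
                text = self.data_queue.get(block=False)
            except queue.Empty:
                continue  # queue is momentarily empty, poll again
            try:
                result = self.parse(text)
                if not result:
                    # the XPath matched nothing: the response is probably not the
                    # expected listing page (anti-crawler page, redirect, ...)
                    print("parse() returned no rows for one page")
                    continue
                with self.lock:
                    insert_data.insert_db(result)
            except Exception as e:
                # parse/insert errors are now reported instead of being swallowed
                print("worker error:", repr(e))

If this prints "parse() returned no rows", the problem is on the scraping side (the proxy or Cookie is not returning the real listing page); if it prints a pymongo error, the insert itself is failing and the message will say why.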


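Independently of the crawler, a quick check from a Python shell shows whether any documents have ever reached the collection; this small sketch just reuses the connection string and names from handle_mongo.py above.

import pymongo

client = pymongo.MongoClient("mongodb://192.168.105.140:27017")
collection = client['db_51job']['collection_51job']
# count_documents({}) returns how many documents are currently stored
print(collection.count_documents({}))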