Teacher, why can't my program crawl any data?
import threading
from queue import Queue, Empty  # queue.Queue is the thread-safe queue; multiprocessing.Queue is for processes
from threading import Thread

import requests
from lxml import etree

from study_crawl_douban.handle_mongo import douban_mongo

# Shutdown flags polled by the worker threads
page_flag = False
data_flag = False


# Fetches each listing page and puts the raw HTML on the data queue
class Crawl_page(Thread):
    # Extend the parent class's constructor with our own attributes
    def __init__(self, thread_name, page_queue, data_queue):
        super(Crawl_page, self).__init__()
        self.thread_name = thread_name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.header = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Cookie": "bid=UK3_qCqd8Tw; __utmc=30149280; __utmz=30149280.1589176061.1.1.utmcsr=class.imooc.com|utmccn=(referral)|utmcmd=referral|utmcct=/lesson/1100; __utmc=223695111; __utmz=223695111.1589176061.1.1.utmcsr=class.imooc.com|utmccn=(referral)|utmcmd=referral|utmcct=/lesson/1100; __gads=ID=3a0d842ebc5e8488:T=1589176028:S=ALNI_MbmOwssmmLmbX5t1P8Hx-VVMX7-Ug; ct=y; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1589283906%2C%22https%3A%2F%2Fclass.imooc.com%2Flesson%2F1100%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.1537622081.1589176061.1589249515.1589283907.3; __utmb=30149280.0.10.1589283907; __utma=223695111.1653981003.1589176061.1589249515.1589283907.3; __utmb=223695111.0.10.1589283907; _pk_id.100001.4cf6=d54cf7120f674951.1589176061.3.1589283922.1589249514.; __yadk_uid=EtEnWgLqtZb6nxHLeCnqXg0D9BV7P9yU",
            "Host": "movie.douban.com",
            "Pragma": "no-cache",
            "Referer": "https://movie.douban.com/top250?start=25&filter=",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
        }
    def run(self):
        print("Page thread started: {}".format(self.thread_name))
        while not page_flag:
            # Queue.get/put take a `block` argument. It defaults to True,
            # which makes the call wait when the queue is empty/full; with
            # block=False the call raises Empty/Full immediately instead,
            # so the loop can keep re-checking the exit flag.
            try:
                page = self.page_queue.get(block=False)
            except Empty:
                continue  # nothing queued right now; re-check the flag
            page_url = 'https://movie.douban.com/top250?start=' + str(page) + '&filter='
            print("Fetching: {}".format(page_url))
            # Optional proxy settings
            # proxy = {
            #     'http': 'http://HIIH05A84N30E6ED:1A6A62A96423AFB9@http-dyn.abuyun.com:9020',
            #     'https': 'https://HIIH05A84N30E6ED:1A6A62A96423AFB9@http-dyn.abuyun.com:9020',
            # }
            # Fetch the listing page
            response = requests.get(url=page_url, headers=self.header)
            # Douban serves UTF-8; decoding as 'gbk' garbles the HTML
            response.encoding = 'utf-8'
            # Hand the page HTML to the parser threads
            self.data_queue.put(response.text)


# Parses the fetched HTML and writes the results to MongoDB
class Crawl_html(Thread):
    # Extend the parent class's constructor
    def __init__(self, thread_name, data_queue, lock):
        super(Crawl_html, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.lock = lock
    def run(self):
        print("Text thread started: {}".format(self.thread_name))
        while not data_flag:
            try:
                text = self.data_queue.get(block=False)
            except Empty:
                continue  # no fetched pages yet; re-check the flag
            result = self.parse(text)
            # `while self.lock:` looped forever (a Lock object is always
            # truthy), inserting the same result over and over; acquire
            # the lock with a `with` block instead
            with self.lock:
                douban_mongo.insert(result)
    def parse(self, text1):
        # Build an element tree from the HTML string
        douban_html = etree.HTML(text1)
        all_message = douban_html.xpath('//li/div[@class="item"]')
        douban_list = []
        for item in all_message:
            # Note the trailing /text() on score and evaluate: without it
            # xpath() returns lxml Element objects, which MongoDB cannot
            # serialize, so every insert fails
            info = {
                'movie_name': item.xpath('.//img/@alt'),
                'actors_information': item.xpath('.//p/text()'),
                'score': item.xpath('.//span[@class="rating_num"]/text()'),
                'evaluate': item.xpath('.//span[@class="inq"]/text()'),
                'from_url': item.xpath('.//a/@href')
            }
            douban_list.append(info)
        return douban_list


def main():
    # Two queues: one for page offsets, one for fetched HTML
    page_queue = Queue()
    data_queue = Queue()
    # A lock serializing the MongoDB writes
    lock = threading.Lock()
    # Queue up the page offsets (0, 25, ..., 225)
    for page in range(0, 250, 25):
        page_queue.put(page)
    print("Pages queued: {}".format(page_queue.qsize()))
    # Start three threads to fetch the listing pages
    page_list = ["PageThread-1", "PageThread-2", "PageThread-3"]
    page_thread_list = []
    for thread_page_name in page_list:
        thread_page = Crawl_page(thread_page_name, page_queue, data_queue)
        thread_page.start()
        page_thread_list.append(thread_page)
    # Start three threads to parse the fetched HTML
    text_list = ["TextThread-1", "TextThread-2", "TextThread-3"]
    text_thread_list = []
    for thread_text_name in text_list:
        thread_text = Crawl_html(thread_text_name, data_queue, lock)
        thread_text.start()
        text_thread_list.append(thread_text)
    # Shut down the page threads once every page offset has been taken
    global page_flag
    while not page_queue.empty():
        pass
    page_flag = True
    for page_thread_join in page_thread_list:
        page_thread_join.join()
        print(page_thread_join.thread_name, "has stopped")
    # Shut down the text threads once every fetched page has been parsed
    global data_flag
    while not data_queue.empty():
        pass
    data_flag = True
    for text_thread_join in text_thread_list:
        text_thread_join.join()
        print(text_thread_join.thread_name, "has stopped")


if __name__ == '__main__':
    main()
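
As a side note, the global-flag shutdown above works, but the "while not queue.empty(): pass" loops spin the CPU at 100% while waiting. A common alternative is to push one sentinel value per worker onto the queue and let the workers use a plain blocking get(). The sketch below is minimal and illustrative, not a drop-in replacement for the crawler; the names worker, SENTINEL, and NUM_WORKERS are my own:

from queue import Queue
from threading import Thread

NUM_WORKERS = 3
SENTINEL = None  # marker telling a worker to exit


def worker(q):
    while True:
        item = q.get()        # blocks until an item is available
        if item is SENTINEL:  # shutdown signal: stop this worker
            break
        print("processing page offset", item)


def run():
    q = Queue()
    threads = [Thread(target=worker, args=(q,)) for _ in range(NUM_WORKERS)]
    for t in threads:
        t.start()
    for page in range(0, 250, 25):  # same offsets as the crawler above
        q.put(page)
    for _ in range(NUM_WORKERS):    # one sentinel per worker
        q.put(SENTINEL)
    for t in threads:
        t.join()                    # every worker has seen its sentinel


run()

With sentinels there is no polling, no Empty handling, and no shared global flags: each worker exits exactly once it has drained its share of the queue.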