82行打印出来乱码, 要怎么解决

82行打印出来乱码, 要怎么解决

# coding: utf-8

import time
import requests
from lxml import etree
import pymongo
from queue import Queue
import threading


class PageThread(threading.Thread):

def __init__(self, thread_name, page_queue, detail_queue):
threading.Thread.__init__(self)
self.thread_name = thread_name
self.page_queue = page_queue
self.detail_queue = detail_queue

def run(self):
while not self.page_queue.empty():
page_url = self.page_queue.get_nowait()
page_html = get_html_by_url(page_url)
detail_html_list = page_html.xpath("//ol[@class='grid_view']/li")
for i, detail_html in enumerate(detail_html_list):
if i > 2:
break
self.detail_queue.put(detail_html)


class DetailThread(threading.Thread):

def __init__(self, thread_name, detail_queue, con, lock):
threading.Thread.__init__(self)
self.thread_name = thread_name
self.detail_queue = detail_queue
self.con = con
self.lock = lock

def run(self):
while not self.detail_queue.empty():
detail_html = self.detail_queue.get_nowait()
info_html = detail_html.xpath('//div[@class="info"]')
movie_dict = {
'movie_name': list_join(info_html.xpath("./div[@class='hd']/a/span/text()")),
'actors_information': list_join(info_html.xpath("./div[@class='bd']/p[1]/text()")),
'score': list_join(info_html.xpath("./div[@class='bd']/div/span[@class='rating_num']/text()")),
'evaluate': list_join(info_html.xpath("./div[@class='bd']/div/span[last()]/text()")),
'describe': list_join(info_html.xpath("./div[@class='bd']//span[@class='inq']"))
}
print(movie_dict)
# with self.lock:
# self.con.insert_one(movie_dict)


def list_join(l):
return ''.join(l)


def get_html_by_url(url):
print(url)
header = {
'Host': 'movie.douban.com',
'Connection': 'keep-alive',
'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
'sec-ch-ua-mobile': '?0',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-User': '?1',
'Sec-Fetch-Dest': 'document',
'Referer': 'https://movie.douban.com/top250?filter=unwatched',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9'
}
response = requests.get(url=url, headers=header, timeout=3)
with open('douban.html', 'wb') as f:
t = response.content
f.write(t)
text = response.text
print(text)
html = etree.HTML(response.text)
html_str = etree.tostring(html).decode()
return html


def main():
page_queue = Queue()
detail_queue = Queue()

con = pymongo.MongoClient(host='127.0.0.1', port=1112)['douban']['movie250']

lock = threading.Lock()

# for i in range(0, 100, 25):
# for i in range(0, 50, 25):
# url = f'https://movie.douban.com/top250?start={i}'
# page_queue.put(url)

url = f'https://movie.douban.com/top250?start=0'
page_queue.put(url)

page_thread_names = ['页面线程1', '页面线程2', '页面线程3']
page_thread_list = []
for page_thread_name in page_thread_names:
page_thread = PageThread(page_thread_name, page_queue, detail_queue)
page_thread.start()
page_thread_list.append(page_thread)

while not page_queue.empty():
pass

for page_thread in page_thread_list:
if page_thread.is_alive():
page_thread.join()

detail_thread_names = ['详情线程1', '详情线程2', '详情线程3', '详情线程4', '详情线程5']
detail_thread_list = []
for detail_thread_name in detail_thread_names:
detail_thread = DetailThread(detail_thread_name, detail_queue, con, lock)
detail_thread.start()
detail_thread_list.append(detail_thread)

while not detail_queue.empty():
pass

for detail_thread in detail_thread_list:
if detail_thread.is_alive():
detail_thread.join()


if __name__ == '__main__':
start_time = time.time()
main()
end_time = time.time()
print('耗时{}秒'.format(end_time - start_time))


正在回答 回答被采纳积分+1

登陆购买课程后可参与讨论,去登陆

1回答
好帮手慕燕燕 2021-08-13 18:42:30

同学,你好!同学的代码中存在以下问题:

1、info_html = detail_html.xpath('//div[@class="info"]')获取到的是列表,包含25条数据,要使用循环来获取其中每一条的电影名称、评分等信息

http://img1.sycdn.imooc.com//climg/6116498209578dd614890705.jpg

2、xpath获取的电影名称、评分、演职人员信息中包含空格换行等特殊字符,可以使用正则表达式将特殊字符替换掉

3、moive_name和describe xpath语法不正确,可参考以下代码

http://img1.sycdn.imooc.com//climg/61164c8a09a5071a12880529.jpg

祝:学习愉快!

  • 提问者 大可计划 #1

    老师可以拿我的代码运行一下吗,82行会打印出一堆乱码,82行打印的是列表页面的html,我还是比较想知道,怎么解决这个乱码问题
    老师提到的其他问题,后续我会修改

    2021-08-13 23:50:46
  • 提问者 大可计划 #2

    这些其他问题, 我还没调试, 现在最关键的问题是

    https://movie.douban.com/top250?start=0

    第一个请求, 返回的文本(在82行有打印)就是乱码, 所以, 之后的代码, 我还没调试
    老师可以先拿我代码去执行一下吗, 看下82行打印的为什么是乱码

    2021-08-14 08:52:36
  • 好帮手慕燕燕 回复 提问者 大可计划 #3

    同学,你好!将header中的Accept-Encoding注释掉,看数据是否正常

    http://img1.sycdn.imooc.com//climg/61171de9096b905207490458.jpg

    祝:学习愉快!​

    2021-08-14 09:36:52
问题已解决,确定采纳
还有疑问,暂不采纳

恭喜解决一个难题,获得1积分~

来为老师/同学的回答评分吧

0 星
请稍等 ...
意见反馈 帮助中心 APP下载
官方微信

在线咨询

领取优惠

免费试听

领取大纲

扫描二维码,添加
你的专属老师