运行下载的示例程序怎么抓取不到内容?
老师:运行下载的示例程序handle_guazi_task.py怎么抓取不到内容?仍然显示“正在打开中,请稍后...”代码如下。还有就是如何在代码中打印cookie,代码中的
print(cookie.name)
print(cookie.value)
怎么打印出来与
print(header['Cookie'])
不一致?求解,谢谢!
import requests
# execjs lets us evaluate the site's obfuscated JavaScript from Python
import execjs
import re
from handle_mongo import mongo

# Listing page; the first request gets the anti-crawler "opening, please wait" page
url = 'https://www.guazi.com/www/buy'
# NOTE: do not put a stale Cookie in the base headers — the site uses it to
# fingerprint and block clients. The valid cookie is computed below.
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Host": "www.guazi.com",
    "Referer": "https://www.guazi.com/www/buy",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3610.2 Safari/537.36",
}

response = requests.get(url=url, headers=header)
# Force UTF-8 so the Chinese challenge text decodes correctly
response.encoding = 'utf-8'

if '正在打开中,请稍后' in response.text:
    # Extract the two arguments of the inline anti('<string>','<key>') call.
    value_search = re.compile(r"anti\('(.*?)','(.*?)'\);")
    # FIX: search once and reuse the match (the original searched twice),
    # and fail loudly if the page layout changed instead of raising an
    # opaque AttributeError on None.
    match = value_search.search(response.text)
    if match is None:
        raise RuntimeError("anti() arguments not found — page layout may have changed")
    string = match.group(1)
    key = match.group(2)
    # Load the reverse-engineered JS that reproduces the cookie computation
    with open('guazi.js', 'r', encoding='UTF-8') as f:
        f_read = f.read()
    # Compile the JS once and call anti() to get the token
    js = execjs.compile(f_read)
    js_return = js.call('anti', string, key)
    print(js_return)
    # The computed token must be sent back as the 'antipas' cookie
    cookie_value = 'antipas=' + js_return
    header['Cookie'] = cookie_value
    response_second = requests.get(url=url, headers=header)
    print(header['Cookie'])
    # response_second.cookies only contains Set-Cookie values FROM the server;
    # it does not echo the Cookie header we sent — hence the apparent mismatch.
    for cookie in response_second.cookies:
        print(cookie.name)
        print(cookie.value)
    print("=========")
    # FIX: show the decoded second response, not the challenge page again
    print(response_second.text)
    city_search = re.compile(r'href="\/(.*?)\/buy"\stitle=".*?">(.*?)\s+</a>')
    brand_search = re.compile(r'href="\/www\/(.*?)\/c-1/#bread"\s+>(.*?)</a>')
    city_list = city_search.findall(response_second.text)
    brand_list = brand_search.findall(response_second.text)
    for city in city_list:
        if city[1] == '北京':
            for brand in brand_list:
                # Example target URLs:
                #   https://www.guazi.com/anqing/buy
                #   https://www.guazi.com/anqing/audi/#bread
                #   https://www.guazi.com/anqing/audi/o1i7/#bread
                info = {
                    'task_url': f'https://www.guazi.com/{city[0]}/{brand[0]}/o1i7',
                    'city_name': city[1],
                    'brand_name': brand[1],
                    'item_type': 'list_item',
                }
                # Queue the listing-page task in MongoDB
                mongo.save_task('guazi_task', info)
19
收起
正在回答 回答被采纳积分+1
2回答
回答时间:2020-12-22 17:07:57
同学,你好,
1、header中的Cookie和cookies是不一样的,同学直接输出header['Cookie']即可
2、二手车网站的js进行了重新编译,同学可根据下述代码获取信息
import requests
# execjs evaluates the site's JavaScript challenge code from Python
import execjs
import re
import json
from guazi_scrapy_project.handle_mongo import mongo

# Entry page; the first hit serves the anti-crawler interstitial
url = 'https://www.guazi.com/www/buy'
# Keep any stale Cookie out of the base headers — the site uses it to detect
# and block repeat scrapers. A fresh token is computed below.
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Host": "www.guazi.com",
    "Referer": "https://www.guazi.com/www/buy",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3610.2 Safari/537.36",
}

response = requests.get(url=url, headers=header)
response.encoding = 'utf-8'  # decode the Chinese page text correctly

if '正在打开中,请稍后' in response.text:
    # Pull both arguments of the inline anti('...','...') call
    value_search = re.compile(r"anti\('(.*?)','(.*?)'\);")
    string = value_search.search(response.text).group(1)
    key = value_search.search(response.text).group(2)
    # Run the reverse-engineered JS to compute the anti-crawl token
    with open('guazi.js', 'r') as js_file:
        js_context = execjs.compile(js_file.read())
    token = js_context.call('anti', string, key)
    # Send the token back as the 'antipas' cookie on the second request
    header['Cookie'] = 'antipas=' + token
    response_second = requests.get(url=url, headers=header)

    # Cities are embedded as inline JSON objects; brands as plain anchors
    city_pattern = re.compile(r'({.*?});')
    brand_pattern = re.compile(r'href="\/www\/(.*?)\/c-1/#bread"\s+>(.*?)</a>')
    city_blobs = city_pattern.findall(response_second.text)
    brand_list = brand_pattern.findall(response_second.text)

    # The first two JSON blobs are unrelated script data; skip them
    for blob in city_blobs[2:]:
        for city_group in json.loads(blob).values():
            for city in city_group:
                if city['name'] != '北京':
                    continue
                for brand_code, brand_name in brand_list:
                    # Example target URLs:
                    #   https://www.guazi.com/anqing/buy
                    #   https://www.guazi.com/anqing/audi/#bread
                    #   https://www.guazi.com/anqing/audi/o1i7/#bread
                    task = {
                        'task_url': 'https://www.guazi.com/' + city['domain'] + '/' + brand_code + '/' + 'o1i7',
                        'city_name': city['name'],
                        'brand_name': brand_name,
                        'item_type': 'list_item',
                    }
                    # Queue the listing-page task in MongoDB
                    mongo.save_task('guazi_task', task)
4.入门主流框架Scrapy与爬虫项目实战
- 参与学习 人
- 提交作业 107 份
- 解答问题 1672 个
Python最广为人知的应用就是爬虫了,有趣且酷的爬虫技能并没有那么遥远,本阶段带你学会利用主流Scrapy框架完成爬取招聘网站和二手车网站的项目实战。
了解课程
恭喜解决一个难题,获得1积分~
来为老师/同学的回答评分吧
0 星