好帮手乔木
2020-03-20 14:29:41
Hello,

The site has recompiled its JS file; here is updated code that works against the current version:
import json
import requests
import execjs
import re
from lxml import etree
from handle_mongo import mongo  # project-local course helper (not used below)
url = 'https://www.guazi.com/anji/buy/'
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Host": "www.guazi.com",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36",
}
response = requests.get(url, headers=header)
response.encoding = 'utf-8'
# the anti-crawler page embeds a call like anti('<string>','<key>')
if '正在打开中,请稍后...' in response.text:
    value_pattern = re.compile(r"anti\('(.*?)','(.*?)'\)")
    string = value_pattern.search(response.text).group(1)
    key = value_pattern.search(response.text).group(2)
    # read the JS file saved from the site
    with open('guazi.js', 'r', encoding='utf-8') as f:
        file_js = f.read()
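    # Note (assumption): execjs only wraps an external JavaScript runtime
    # (e.g. Node.js) that it finds on the machine; if no runtime is
    # available, the compile call below fails, so install one first.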
    js = execjs.compile(file_js)
    js_return = js.call('anti', string, key)
    cookie_value = 'antipas=' + js_return
    header['Cookie'] = cookie_value
    response = requests.get(url, headers=header)
# print(response.text)
guazi_html = etree.HTML(response.text)
# the city data sits in the page's third <script> tag as JS object literals
script_js = guazi_html.xpath("//script[3]/text()")[0]
city_search = re.compile(r'({.*?});')
city = city_search.findall(script_js)
# cityLeft holds each city's Chinese and English names
cityOne = json.loads(city[0])
cityTwo = json.loads(city[1])
A_M = [chr(i) for i in range(65, 78)]  # initials 'A'..'M'
N_Z = [chr(i) for i in range(78, 91)]  # initials 'N'..'Z'
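# Assumption from the page structure: the two JSON blobs split the city
# dictionary by initial letter (cityOne covers A-M, cityTwo covers N-Z),
# so each half is walked with its own letter range below.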
all_city = []
for i in A_M:
    # cities sharing the same initial letter
    each_list1 = cityOne.get(i)
    if each_list1:
        all_city.append(each_list1)
for i in N_Z:
    each_list2 = cityTwo.get(i)
    if each_list2:
        all_city.append(each_list2)
brand_list = guazi_html.xpath('//div[@class="dd-all clearfix js-brand js-option-hid-info"]//a')
info_list = []
brand_url = re.compile(r'/.*?/(.*?)/#bread')
# all cities
print(all_city)
for item in brand_list:
    bl = brand_url.search(item.xpath('./@href')[0]).group(1)
    # all brand names
    print(bl)

If this solves your problem, please accept the answer. Happy learning ^_^
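P.S. handle_mongo is a project-local module, so the snippet above won't run standalone without it. Here is a minimal stand-in, assuming pymongo and a MongoDB server on localhost; the name mongo comes from the import, but the database/collection names and the insert_item method are invented for this sketch:

import pymongo

class mongo:
    # hypothetical stand-in for the course's handle_mongo helper
    def __init__(self):
        # assumes a MongoDB server on the default local host/port
        client = pymongo.MongoClient('mongodb://127.0.0.1:27017')
        # database and collection names invented for this sketch
        self.collection = client['guazi']['info']

    def insert_item(self, item):
        # store one scraped record (a plain dict)
        self.collection.insert_one(item)

With something like that in place, you could replace the print(bl) calls with mongo().insert_item({'brand': bl}) to persist the results instead of printing them.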