Python 爬取考研调剂计划余额信息
今年由于疫情原因,考研复试、调剂纷纷推迟,直到 5 月 20 日,考研调剂系统才开通。
但是调剂信息量非常大,毕竟中国大学多到数不清,而且一所学校不止一条调剂信息,可想而知,信息量有多大。虽然系统有一部分筛选条件,但是这些筛选条件可能依然不能满足需求,这就需要把所有可能需要的数据爬取下来,进行进一步的筛选。
1 前期工作
- 打开研招网,并进行登录:https://yz.chsi.com.cn/
- 点击网上调剂
- 此时就可以根据条件进行查找调剂信息
- 在专业的地方可以输入你想要查询的专业,比如“数学”,然后如图依次进行操作即可
- 点击消息头,查看请求网址、方法和请求头(包括 cookies)
- 点击参数,查看表单数据
2 代码编写
import json
import requests
def find_school(start, zymc, timeout=10):
    """Fetch one page of adjustment-quota records from the yz.chsi.com.cn service.

    Sends a form-encoded POST to the ``sytjqexxcx.action`` endpoint and
    returns the list found at ``["data"]["vo_list"]["vos"]`` in the JSON reply.

    Args:
        start: Pagination offset sent as the ``start`` form field. The caller
            in this file passes ``""`` for the first page and ``"20"``,
            ``"40"``, ... for the following ones.
        zymc: Major name sent as the ``zymc`` form field.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests would block indefinitely on a stalled
            connection.

    Returns:
        The ``vos`` list from the response; each element is one record.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection failure or timeout.
        ValueError: The response body is not valid JSON.
        KeyError: The JSON does not contain ``data.vo_list.vos``.
    """
    # Headers copied from a logged-in browser session.
    # NOTE(review): the Cookie value is a captured session and presumably
    # expires; replace it with your own before running.
    # "Content-Length" is intentionally not set: requests derives it from the
    # encoded form body, and a hard-coded value does not match the real body.
    headers = {
        "Host": "yz.chsi.com.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0",
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        # "br" is not advertised: requests can only decode Brotli when an
        # optional package is installed, otherwise the body would be unreadable.
        "Accept-Encoding": "gzip, deflate",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "X-Requested-With": "XMLHttpRequest",
        "Origin": "https://yz.chsi.com.cn",
        "Connection": "keep-alive",
        "Referer": "https://yz.chsi.com.cn/sytj/tj/qecx.html",
        "Cookie": "JSESSIONID=978BA64E6F4530EDB50FE43A3C805EBE; _ga=GA1.3.1200733016.1569507824; zg_did=%7B%22did%22%3A+%2216d6df4d287815-0a7df95b3ba0b78-4c312272-144000-16d6df4d288505%22%7D; zg_adfb574f9c54457db21741353c3b0aa7=%7B%22sid%22%3A+1590297728210%2C%22updated%22%3A+1590298426969%2C%22info%22%3A+1589874544864%2C%22superProperty%22%3A+%22%7B%7D%22%2C%22platform%22%3A+%22%7B%7D%22%2C%22utm%22%3A+%22%7B%7D%22%2C%22referrerDomain%22%3A+%22www.baidu.com%22%2C%22landHref%22%3A+%22https%3A%2F%2Fyz.chsi.com.cn%2F%22%2C%22cuid%22%3A+%226dc3b88f096e00fec19e8c56fc31b1e3%22%7D; _ga=GA1.4.1200733016.1569507824; zg_0d76434d9bb94abfaa16e1d5a3d82b52=%7B%22sid%22%3A+1569508637357%2C%22updated%22%3A+1569510245680%2C%22info%22%3A+1569508637361%2C%22superProperty%22%3A+%22%7B%7D%22%2C%22platform%22%3A+%22%7B%7D%22%2C%22utm%22%3A+%22%7B%7D%22%2C%22referrerDomain%22%3A+%22my.chsi.com.cn%22%2C%22cuid%22%3A+%226dc3b88f096e00fec19e8c56fc31b1e3%22%7D; acw_tc=2760827b15879041262595723ec515fe39010551ca912d1067523ce103dcbf; _gid=GA1.3.770260242.1589874545; __utma=229973332.1200733016.1569507824.1589904202.1589904202.1; __utmz=229973332.1589904202.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); aliyungf_tc=AQAAAGUbgAPRoQoA1Qjjb9p0NlS6rwTj; JSESSIONID=386B18A445F1FE086FAB8E0CD8865D8B; XSRF-CCKTOKEN=3514cccbe2916ac49c3a4b561173a314; CHSICC_CLIENTFLAGYZ=7013645e7a0bb77e53e19b04fd77cf6a; CHSICC_CLIENTFLAGSYTJ=43fdc33b0ff004fe35738549a78838c4",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }
    # Form fields as captured from the browser's search request.
    data = {
        "pageSize": "20",
        "start": start,  # pagination offset
        "orderBy": "",
        "ssdm": "",
        "dwmc": "",
        "xxfs": "1",
        "zymc": zymc,  # major name
        "qers": "",
        "data_type": "json",
        "agent_from": "web",
        "pageid": "tj_qe_list",
    }
    url = "https://yz.chsi.com.cn/sytj/stu/sytjqexxcx.action"
    resp = requests.post(url, headers=headers, data=data, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    resp.raise_for_status()
    # List of records for this page.
    return resp.json()["data"]["vo_list"]["vos"]
# Major names to search for; each is sent as the "zymc" form field.
zymc_list = ["数学", "图论及其应用", "运筹学与控制论", "应用数学", "概率论与数理统计", "计算数学", "几何与代数", "控制论与智能优化", "数理统计", "大数据分析与应用", "应用统计和数据分析"]

# Accumulates the records returned for every major and every page.
all_school_list = []

for zymc in zymc_list:
    # At most 20 pages are requested per major.
    for page in range(20):
        # The first page is requested with an empty offset; later pages use
        # a multiple of 20 as the offset, passed as a string.
        if page > 0:
            offset = str(page * 20)
        else:
            offset = ""
        batch = find_school(offset, zymc)
        all_school_list.extend(batch)
        # Fewer than 20 records on a page ends the paging for this major.
        if len(batch) < 20:
            break

print(all_school_list)
说明:
“qers”: “余额人数”,
“fbsjStr”: “发布时间”,
“hasit”: “考生是否已经填报该志愿 true 或 false”,
“dwmc”: “单位名称”,
“yxsmc”: “院系所名称”,
“yjfxdm”: “研究方向代码”,
“zymc”: “专业名称”,
“zydm”: “专业代码”,
“dwdm”: “单位代码”,
“xxfs”: “学习方式”,
“sfmzyq”: “是否满足要求,空为满足要求,非空其内容为不满足要求原因”,
“bz”: “调剂特殊要求”,
“gxsj”: “距离最后更新时间已过 xx 分钟”,
“yjfxmc”: “研究方向名称”,
“zt”: “余额状态”,
“id”: “余额信息 ID”,
“yxsdm”: “院系所代码”,
“ssdm”: “省市代码”
运行结果:
[{
'qers': 0,
'fbsjStr': '',
'hasit': False,
'dwmc': '沈阳航空航天大学',
'yxsmc': '理学院',
'yjfxdm': '00',
'zymc': '数学',
'zydm': '070100',
'dwdm': '10143',
'xxfs': '1',
'sfmzyq': '',
'bz': '详见研究生院官网学院调剂公告',
'gxsj': 1812,
'yjfxmc': '不区分研究方向',
'zt': '只公布有计划余额',
'id': 'glygw21dkpjb7vlj',
'yxsdm': '012',
'ssdm': ''
},
... ... ... ...
{
'qers': 0,
'fbsjStr': '',
'hasit': False,
'dwmc': '宝鸡文理学院',
'yxsmc': '数学与信息科学学院',
'yjfxdm': '00',
'zymc': '计算数学',
'zydm': '070102',
'dwdm': '10721',
'xxfs': '1',
'sfmzyq': '',
'bz': '本科修读专业原则上为数学、应用数学、信息与计算科学、概率统计等专业。',
'gxsj': 146,
'yjfxmc': '不区分研究方向',
'zt': '只公布有计划余额',
'id': 'opiep577yqad0xx9',
'yxsdm': '009',
'ssdm': ''
}]
非常好的帖子,也很实用~~