[Python] 高考报志愿愁死人,爬点数据助力选大学
最近高考出分,填报志愿让人发愁,于是抓取一些数据,帮大家好好分析一下报考哪所大学最合适。
# -- coding:UTF-8 --
import json as j
import time
import csv
from fake_useragent import UserAgent
import requests
import random
# Endpoint of the school-list search API used by request_school().
url = "https://api.eol.cn/gkcx/api/?"

# A single random User-Agent string is picked once at import time and is
# reused by every request that reads ``us``.
ua = UserAgent()
us = ua.random

# Default headers sent with the school-list request.
headers = {
    "Host": "api.eol.cn",
    "Referer": "https://gkcx.eol.cn/school/search",
    "User-Agent": f"{us}",
}
# 请求头部以及请求载荷
def request_school(page, timeout=10):
    """Fetch one page of the school list from the search API.

    Args:
        page: Page number to request (sent as a string in the payload).
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout ``requests`` may block forever on a stalled connection.

    Returns:
        The decoded JSON body, or ``None`` when the request fails, the
        status code is not 200, or the body is not valid JSON.
    """
    # Query payload: 20 schools per page, sorted by "view_total".
    reque_pay = {
        "access_token": "",
        "admissions": "",
        "central": "",
        "department": "",
        "dual_class": "",
        "f211": "",
        "f985": "",
        "is_doublehigh": "",
        "is_dual_class": "",
        "keyword": "",
        "nature": "",
        "page": f"{page}",
        "province_id": "",
        "ranktype": "",
        "request_type": 1,
        "school_type": "",
        "size": 20,
        "sort": "view_total",
        "top_school_id": "[766]",
        "type": "",
        "uri": "apidata/api/gk/school/lists",
    }
    try:
        open_url = requests.post(
            url, data=j.dumps(reque_pay), headers=headers, timeout=timeout
        )
    except requests.RequestException as e:
        # Covers connection errors as well as the timeouts introduced above.
        print("error", e.args)
        return None
    if open_url.status_code != 200:
        return None
    try:
        return open_url.json()
    except ValueError as e:
        # The server answered 200 but the body could not be parsed as JSON.
        print("error", e.args)
        return None
# 基本信息
def news(json, delay_range=(3, 10)):
    """Yield one basic-information record per school in an API response.

    Args:
        json: Decoded response of ``request_school`` (a dict), or a falsy
            value when the request failed.
        delay_range: ``(low, high)`` bounds, in whole seconds, of the random
            pause taken before the response is processed.  The default keeps
            the original 3-10 second throttle.

    Yields:
        dict: A record with the Chinese column names used by the savers.
    """
    # Throttle: sleep a random number of seconds before touching the data.
    time.sleep(random.randint(*delay_range))
    if not json:
        return
    payload = json.get("data")
    schools = payload.get("item") if isinstance(payload, dict) else None
    if not schools:
        # Missing or empty "data"/"item": nothing to report for this page.
        return
    print(schools)
    for school in schools:
        # Either name may be missing/None; treat it as empty instead of
        # failing on ``None + "|"``.
        level = (school.get("dual_class_name") or "") + "|" + (
            school.get("nature_name") or ""
        )
        yield {
            "学校id": school.get("school_id"),
            "名字": school.get("name"),
            "类型": school.get("type_name"),
            "科类": school.get("level_name"),
            "级别": level,
            "位置": school.get("address"),
            "招生咨询网站": school.get("answerurl"),
        }
# 分数线与专业线
def math(lst, match_year, subject, province):
    """Yield the provincial admission score lines of one school.

    Reads the module-level names ``us`` (User-Agent string) and
    ``school_name`` (set by the caller before this generator is consumed).

    Args:
        lst: School id placed in the URL and the Referer header.
        match_year: Year segment of the URL.
        subject: Subject segment of the URL (the caller documents
            1 = science, 2 = liberal arts).
        province: Province id segment of the URL.

    Yields:
        dict: One record per item in the response's ``data.item`` list.
    """
    math_headers = {
        "Host": "static-data.eol.cn",
        "Origin": "https://gkcx.eol.cn",
        "Referer": f"https://gkcx.eol.cn/school/{lst}/provinceline",
        "User-Agent": f"{us}",
    }
    math_url = f"https://static-data.eol.cn/www/2.0/schoolprovinceindex/{match_year}/{lst}/{province}/{subject}/1.json"
    try:
        math_request = requests.get(math_url, headers=math_headers, timeout=10)
    except requests.RequestException as m:
        # Nothing to parse when the request itself failed.
        print("errp", m.args)
        return
    math_get = None
    # Check the status of *this* response (the original compared the
    # ``requests.status_codes`` module with 200, which is never true).
    if math_request.status_code == 200:
        try:
            math_get = math_request.json().get("data").get("item")
        except (AttributeError, ValueError):
            # Body is not JSON, or "data" is missing / not a mapping.
            math_get = None
    if not isinstance(math_get, list):
        print(school_name + ":" + "暂时还没有其内容")
        return
    for a_match in math_get:
        if not isinstance(a_match, dict):
            continue
        lowest = a_match.get("min")
        rank = a_match.get("min_section")
        math_data = {
            "学校名字": school_name,
            "年份": a_match.get("year"),
            "录取批次": a_match.get("local_batch_name"),
            "招生类型": a_match.get("zslx_name"),
            # Format instead of "+" so numeric or missing values do not raise.
            "最低分/最低位次": "{}/{}".format(
                "" if lowest is None else lowest, "" if rank is None else rank
            ),
            "省控线": a_match.get("proscore"),
        }
        print("正在获取")
        yield math_data
# 招生计划
def Enrollment_plan(lst, match_year, subject, province, E_batch):
    """Yield the enrollment plan (majors and planned intake) of one school.

    Reads the module-level ``school_name`` set by the caller.

    Args:
        lst: School id placed in the URL and the Referer header.
        match_year: Year segment of the URL.
        subject: Subject segment of the URL (the caller documents
            1 = science, 2 = liberal arts).
        province: Province id segment of the URL.
        E_batch: Admission batch segment of the URL.

    Yields:
        dict: One record per item in the response's ``data.item`` list.
    """
    math_headers = {
        "Host": "static-data.gaokao.cn",
        "Origin": "https://www.gaokao.cn",
        "Referer": f"https://www.gaokao.cn/school/{lst}/provinceline",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.400",
    }
    E_url = f"https://static-data.gaokao.cn/www/2.0/schoolplanindex/{match_year}/{lst}/{province}/{subject}/{E_batch}/1.json"
    try:
        E_request = requests.get(E_url, headers=math_headers, timeout=10)
    except requests.RequestException as m:
        # Nothing to parse when the request itself failed.
        print("errp", m.args)
        return
    E_get = None
    # Check the status of *this* response (the original compared the
    # ``requests.status_codes`` module with 200, which is never true).
    if E_request.status_code == 200:
        try:
            E_get = E_request.json().get("data").get("item")
        except (AttributeError, ValueError):
            # Body is not JSON, or "data" is missing / not a mapping.
            E_get = None
    if not isinstance(E_get, list):
        print(school_name + ":" + "暂时还没有其内容")
        # Show the URL that produced no data so it can be inspected by hand.
        print(E_url)
        return
    for E_match in E_get:
        if not isinstance(E_match, dict):
            continue
        yield {
            "学校名字": school_name,
            "专业名称": E_match.get("spname"),
            "学科门类": E_match.get("level2_name"),
            "计划招生": E_match.get("num"),
            "学制": E_match.get("length"),
        }
# 专业分数线
def Professional_score_line(lst, match_year, subject, province, E_batch):
    """Yield the per-major score lines of one school.

    Reads the module-level ``school_name`` set by the caller.

    Args:
        lst: School id placed in the URL and the Referer header.
        match_year: Year segment of the URL.
        subject: Subject segment of the URL (the caller documents
            1 = science, 2 = liberal arts).
        province: Province id segment of the URL.
        E_batch: Admission batch code.  Per the original notes:
            7 = first undergraduate batch, 6 = early-admission undergraduate
            batch, 10 = junior-college batch.

    Yields:
        dict: One record per item in the response's ``data.item`` list.
    """
    math_headers = {
        "Host": "static-data.eol.cn",
        "Origin": "https://gkcx.eol.cn",
        "Referer": f"https://gkcx.eol.cn/school/{lst}/provinceline",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.400",
    }
    # NOTE(review): this is the same "schoolplanindex" path that
    # Enrollment_plan requests; a score-line endpoint may have been
    # intended here - confirm against the API before relying on "平均分".
    E_url = f"https://static-data.eol.cn/www/2.0/schoolplanindex/{match_year}/{lst}/{province}/{subject}/{E_batch}/1.json"
    try:
        R_request = requests.get(E_url, headers=math_headers, timeout=10)
    except requests.RequestException as m:
        # Nothing to parse when the request itself failed.
        print("errp", m.args)
        return
    R_get = None
    # Check the status of *this* response (the original compared the
    # ``requests.status_codes`` module with 200, which is never true).
    if R_request.status_code == 200:
        try:
            R_get = R_request.json().get("data").get("item")
        except (AttributeError, ValueError):
            # Body is not JSON, or "data" is missing / not a mapping.
            R_get = None
    if not isinstance(R_get, list):
        print(school_name + ":" + "暂时还没有其内容")
        return
    for R_match in R_get:
        if not isinstance(R_match, dict):
            continue
        yield {
            "学校名字": school_name,
            "专业名称": R_match.get("spname"),
            "录取批次": R_match.get("level1_name"),
            "招生类型": R_match.get("zslx_name"),
            "平均分": R_match.get("average"),
        }
# 第二单元
def manu(choose, data_id):
    """Return the record to save for the current school.

    Reads the module-level ``SCHOOL`` and ``E_batch`` and, for choices other
    than 1, ``match_yearS``, ``subjectS`` and ``provinceS``.

    Args:
        choose: Menu choice (1 = basic info, 2 = provincial score line,
            3 = enrollment plan, 4 = per-major score line).
        data_id: Basic-information record of the current school.

    Returns:
        ``data_id`` for choice 1; otherwise only the FIRST record produced
        by the matching generator, or ``None`` when it produces nothing or
        the choice is unknown.
    """
    school_id = SCHOOL
    if choose == 1:
        # Fixed defaults: year 2021, subject 2 (liberal arts), province id 15.
        match_year, subject, province = 2021, 2, 15
    else:
        match_year, subject, province = match_yearS, subjectS, provinceS

    # Generators are lazy: nothing is requested until one of them is advanced.
    sources = {
        2: math(school_id, match_year, subject, province),
        3: Enrollment_plan(school_id, match_year, subject, province, E_batch),
        4: Professional_score_line(
            school_id, match_year, subject, province, E_batch),
    }
    if choose == 1:
        return data_id
    if choose in sources:
        return next(sources[choose], None)
    return None
def save_josn(result, choose):
    """Append ``result`` as indented JSON to the file selected by ``choose``.

    Args:
        result: JSON-serialisable object to write.
        choose: Menu choice 1-4 selecting the output file name.

    An unknown choice prints an error and writes nothing (the original went
    on to use an unbound file name and crashed).
    """
    file_stems = {
        1: "school_jiben",
        2: "mat",
        3: "Enrollment_plan",
        4: "Professional_score_line",
    }
    name_xls = file_stems.get(choose)
    if name_xls is None:
        print("出错了")
        return
    # ``with`` closes the file even if serialisation raises.
    # NOTE: each call appends another JSON document to the same file.
    with open(f"{name_xls}.json", mode="a+", encoding="utf-8") as open_file:
        j.dump(result, open_file, ensure_ascii=False, indent=4)
# xls数据保存(待完成)
def save_data(choose, data):
    """Append the values of ``data`` as one CSV row to the chosen file.

    Args:
        choose: Menu choice 1-4 selecting the output file name.
        data: Mapping whose values form the row; falsy values are skipped.

    An unknown choice prints an error and writes nothing (the original went
    on to use an unbound file name and crashed).
    """
    file_stems = {
        1: "school_jiben",
        2: "mat",
        3: "Enrollment_plan",
        4: "Professional_score_line",
    }
    name_xls = file_stems.get(choose)
    if name_xls is None:
        print("出错了")
        return
    if not data:
        return
    with open(f'{name_xls}.csv', mode='a', newline='', encoding='utf8') as cfa:
        csv.writer(cfa).writerow(data.values())
if __name__ == "__main__":
    # Entry point of the first script: ask what to crawl, then walk every
    # page of the school list and save one record per school.
    data_a=[]
    print("城市参数在ID里面")
    print("获取基本信息输入: 1")
    print("获取省内分数线输: 2")
    print("获取计划招生输入: 3")
    print("获取专业线分数输:4")
    try:
        choose = int(input("请输入查询数据:"))
    except ValueError:
        choose = None
    if choose not in (1, 2, 3, 4):
        # Reject anything outside the menu before any request is made.
        print("\033[0:31m 不要乱输入哦\033[0m")
        raise SystemExit(1)
    if choose != 1:
        # These three stay module-level names because manu() reads them.
        match_yearS = input("输入查询年份:").strip()  # year
        subjectS = input("选择学科:").strip()  # 1 = science, 2 = liberal arts
        provinceS = input("输入查询省份ID:").strip()  # province id
        if not (match_yearS.isdigit() and subjectS.isdigit()
                and provinceS.isdigit()):
            print("\033[0:31m 不要乱输入哦\033[0m")
            raise SystemExit(1)
    print("\033[0:32m 获取成功,爬虫正在爬取,等待一会查看目录下文件即可\033[0m")

    # The original note says there are 143 pages in total; range(1, 143)
    # stopped at page 142.  Lower this number to crawl fewer pages.
    total_pages = 143
    for page in range(1, total_pages + 1):
        payload = request_school(page)
        for data_id in news(payload):
            # SCHOOL, school_name and E_batch are read as globals by manu()
            # and the generator functions it calls.
            SCHOOL = data_id.get("学校id")
            school_name = data_id.get("名字")
            # Batch code: 7 for schools whose record contains "普通本科",
            # otherwise 10 (original note: 7 first batch, 8 second batch,
            # 10 junior college).
            E_batch = 7 if "普通本科" in data_id.values() else 10
            data = manu(choose, data_id)
            if data is None:
                # Nothing was found for this school; do not append "null".
                continue
            save_josn(data, choose)
            save_data(choose, data)


# [Python] Second script: query the lowest admission scores of past years
# (original title: 高考历年分数最低查询)
#encoding :utf-8
import requests
import json
import csv
import os
import pandas as pd
import openpyxl
# Province code (as a string) -> province name.
pro_id = {
    '11': '北京市',
    '12': '天津市',
    '13': '河北省',
    '14': '山西省',
    '15': '内蒙古自治区',
    '21': '辽宁省',
    '22': '吉林省',
    '23': '黑龙江省',
    '31': '上海市',
    '32': '江苏省',
    '33': '浙江省',
    '34': '安徽省',
    '35': '福建省',
    '36': '江西省',
    '37': '山东省',
    '41': '河南省',
    '42': '湖北省',
    '43': '湖南省',
    '44': '广东省',
    '45': '广西壮族自治区',
    '46': '海南省',
    '50': '重庆市',
    '51': '四川省',
    '52': '贵州省',
    '53': '云南省',
    '54': '西藏自治区',
    '61': '陕西省',
    '62': '甘肃省',
    '63': '青海省',
    '64': '宁夏回族自治区',
    '65': '新疆维吾尔自治区',
    '71': '台湾省',
    '81': '香港特别行政区',
    '82': '澳门特别行政区',
}

# Subject-category code (as a string) -> display name.
typ_dict = {
    '2073': '物理类',
    '2074': '历史类',
    '4': '艺术类',
    '1': '理科',
    '2': '文科',
    '3': '综合',
}

# Admission-batch code (as a string) -> display name.
batch_dict = {
    '7': '一本(广西、陕西)',
    '8': '二本(广西、陕西)',
    '14': '本科(广东、湖南)',
    '1570': '普通类一段(山东)',
    '1571': '普通类二段(山东)',
}

# Headers sent with every request made by the functions below.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5',
}
# 获取单个学校id
def get_school_id(schuool_name):
    """Look up the id of a single school by its exact name.

    Args:
        schuool_name: School name to search for (the parameter spelling is
            kept for backward compatibility).

    Returns:
        The ``school_id`` of the matching entry (the last one, if several
        entries carry exactly this name), or ``-1`` when nothing matches or
        the response cannot be interpreted.

    Raises:
        requests.RequestException: If the HTTP request itself fails or
            times out.
    """
    # Local import so this function does not depend on the file's import block.
    from urllib.parse import quote

    school_id = -1
    # Quote the keyword so characters such as "&" or "#" in a name cannot
    # corrupt the query string.
    reque = 'https://api.eol.cn/gkcx/api/?keyword=%s&uri=apigkcx/api/school/hotlists' % quote(str(schuool_name))
    re_json = requests.get(reque, headers=headers, timeout=10).text
    try:
        re_dict = json.loads(re_json)
    except ValueError:
        # Body is not JSON: treat it as "not found".
        return school_id
    if not isinstance(re_dict, dict) or re_dict.get('code') != '0000':
        return school_id
    data = re_dict.get('data')
    items = data.get('item') if isinstance(data, dict) else None
    for entry in items or []:
        if entry.get('name') == schuool_name:
            school_id = entry.get('school_id', school_id)
    return school_id
# 通过年份获取历年成绩
def get_year_grade(year:int, school_id:int, province_id:int, typ:int, batch:int):
    """Collect every per-major score item of one school for one year.

    Pages ``1.json``, ``2.json``, ... are requested until the server signals
    the end (a ``""`` body, a non-200 status, a non-JSON body or an empty
    page), with a hard cap of 998 pages.

    Args:
        year: Admission year.
        school_id: Id of the school.
        province_id: Id of the province the students come from.
        typ: Subject category code (e.g. 1 = science, 2 = liberal arts).
        batch: Admission batch code.

    Returns:
        list: The raw item dicts of all pages, in page order.

    Raises:
        requests.RequestException: If an HTTP request fails or times out.
    """
    item_list = []
    for num in range(1, 999):
        # Example: https://static-data.gaokao.cn/www/2.0/schoolspecialindex/2021/3593/45/2/10/1.json
        url = 'https://static-data.gaokao.cn/www/2.0/schoolspecialindex/%d/%d/%d/%d/%d/%d.json' % (year, school_id, province_id, typ, batch, num)
        response = requests.get(url, headers=headers, timeout=10)
        re_json = response.text
        if response.status_code != 200 or re_json == '""':
            break
        try:
            re_dict = json.loads(re_json)
        except ValueError:
            # A body that is not JSON cannot be a data page: stop paging
            # instead of crashing the whole crawl.
            break
        if not isinstance(re_dict, dict):
            break
        if 'code' not in re_dict:
            # Same as the original: skip pages without a "code" field.
            continue
        data = re_dict.get('data')
        items = data.get('item') if isinstance(data, dict) else None
        if not items:
            # An empty page means there is nothing further to fetch.
            break
        item_list.extend(items)
    return item_list
# 获取省份的全部学校信息(包含id)
def get_province_all(province_id:int):
    """Return the raw records (including ids) of every school in a province.

    Pages are requested one by one until an empty page, a non-200 status or
    a non-JSON body is received, with a hard cap of 98 pages.

    Args:
        province_id: Id of the province whose schools are listed.

    Returns:
        list: The raw school dicts of all pages, in page order.

    Raises:
        requests.RequestException: If an HTTP request fails or times out.
    """
    # Example of one item returned by the API:
    # {"address":"广西壮族自治区北海市银海区金海岸大道666号","admissions":2,"answerurl":"","belong":"广西壮族自治区教育厅","central":2,"city_id":4505,"city_name":"北海市","code_enroll":1476500,"colleges_level":"","county_id":450503,"county_name":"银海区","department":2,"doublehigh":0,"dual_class":38003,"dual_class_name":"","f211":2,"f985":2,"id":"gkschool3593","is_logo":1,"is_recruitment":1,"is_top":2,"level":2002,"level_name":"专科(高职)","name":"北海康养职业学院","nature":36001,"nature_name":"民办","province_id":45,"province_name":"广西","rank":2807,"rank_type":10,"school_id":3593,"school_type":6001,"single_province":"","special":[],"type":0,"type_name":"","view_month":"242","view_month_number":242,"view_total":"3692","view_total_number":3692,"view_week":"0","view_week_number":0,"view_year":3692}
    item_list = []
    for page in range(1, 99):
        url = 'https://api.eol.cn/web/api/?&page=%d&province_id=%d&uri=apidata/api/gk/school/lists' % (page, province_id)
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            break
        try:
            re_dict = json.loads(response.text)
        except ValueError:
            # Not JSON: stop paging instead of crashing.
            break
        if not isinstance(re_dict, dict):
            break
        if 'code' not in re_dict:
            # Same as the original: skip pages without a "code" field.
            continue
        data = re_dict.get('data')
        items = data.get('item') if isinstance(data, dict) else None
        if not items:
            # An empty page marks the end of the list.
            break
        item_list.extend(items)
    return item_list
# https://api.eol.cn/web/api/?&page=8&province_id=45&uri=apidata/api/gk/school/lists
# 处理数据,获取学校名字和id
def return_all_school_id(school_info:list):
    """Reduce raw school records to their name and id.

    Args:
        school_info: Iterable of dicts that each contain ``'name'`` and
            ``'school_id'``.

    Returns:
        list: One ``{'name': ..., 'school_id': ...}`` dict per input record,
        in input order.
    """
    return [
        {'name': record['name'], 'school_id': record['school_id']}
        for record in school_info
    ]
# 提取成绩信息
def return_all_grade(grade_info:list):
    """Keep only the score-related fields of each raw grade record.

    Args:
        grade_info: Iterable of dicts containing at least the six keys
            listed in ``wanted`` below.

    Returns:
        list: One dict per input record holding batch name, major name,
        lowest score, lowest rank, category code and source-province code.
    """
    wanted = (
        'local_batch_name',
        'spname',
        'min',
        'min_section',
        'type',
        'province',  # province the students come from
    )
    return [{key: record[key] for key in wanted} for record in grade_info]
# 写csv文件
def write_csv(path:str, date:list):
    """Write rows to ``path`` as CSV, convert it to .xlsx, then drop the CSV.

    Args:
        path: Target CSV path; the workbook is written next to it with the
            extension replaced by ``.xlsx``.
        date: List of dicts sharing the keys of the first dict.  ``None``
            or an empty list prints a notice and writes nothing.
    """
    if not date:
        print('抱歉,啥都没有')
        return
    # Create the output directory on demand; open() would otherwise fail
    # for paths such as './20220627/xxx.csv' on a fresh checkout.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    # Only write the header when starting a new file.
    file_ex = os.path.exists(path)
    with open(path, 'a+', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(date[0]))
        if not file_ex:
            writer.writeheader()
        writer.writerows(date)
    # The file is closed (and flushed) here, so pandas sees every row.
    csv_excel = pd.read_csv(path, encoding='utf-8')
    # splitext instead of path[:-4]: correct for any extension length.
    csv_excel.to_excel(os.path.splitext(path)[0] + '.xlsx', sheet_name='data')
    os.remove(path)
# 获取某个省份的全部学校录取某个省份的最低分
def get_test(year:int, province_src:int, province_trg:int, typ:int, batch:int, file_path:str):
    """Collect the lowest per-major scores of every school in a province.

    Args:
        year: Admission year.
        province_src: Id of the province whose schools are queried.
        province_trg: Id of the province the students come from.
        typ: Subject category code.
        batch: Admission batch code.
        file_path: CSV path handed to ``write_csv``; pass ``''`` to get the
            rows back instead of writing them.

    Returns:
        list | None: The collected rows when ``file_path`` is ``''``,
        otherwise ``None`` after the rows have been written.
    """
    # Names and ids of all schools in the source province.
    school_info = return_all_school_id(get_province_all(province_src))
    test_list = []
    for num, school in enumerate(school_info, start=1):
        print(num)  # progress counter
        gra = return_all_grade(
            get_year_grade(year=year, school_id=school['school_id'],
                           province_id=province_trg, typ=typ, batch=batch)
        )
        for g in gra:
            province_code = str(g['province'])
            type_code = str(g['type'])
            test_list.append(
                {
                    '学校': school['name'],
                    # Use the requested year (the original always wrote 2021).
                    '年份': year,
                    '批次': g['local_batch_name'],
                    # Fall back to the raw code for values missing from the
                    # lookup tables instead of raising KeyError mid-crawl.
                    '生源地': pro_id.get(province_code, province_code),
                    '类别': typ_dict.get(type_code, type_code),
                    '专业': g['spname'],
                    '最低分': g['min'],
                    '最低排名': g['min_section'],
                }
            )
    if file_path == '':
        return test_list
    write_csv(file_path, test_list)
def get_max_mean_min(grade_info:list):
    """Print batch, major, min/max/average score and lowest rank per record.

    Args:
        grade_info: Iterable of raw grade dicts containing the source keys
            listed in ``columns`` below.

    Returns:
        None. Each summary dict is printed followed by a blank line.
    """
    # (output label, source key) pairs, in output order.
    columns = (
        ('批次', 'local_batch_name'),
        ('专业', 'spname'),
        ('最低分', 'min'),
        ('最高分', 'max'),
        ('平均分', 'average'),
        ('最低排名', 'min_section'),
    )
    # Build every summary first, then print, as the original did.
    summaries = [
        {label: record[field] for label, field in columns}
        for record in grade_info
    ]
    for summary in summaries:
        print(summary, '\n')
# 获取全国对某个省份的专业录取最低分
def get_test_1(year:int, province_trg:int, typ:int, batch:int, file_path):
    """Collect the lowest per-major scores of schools in every province.

    Args:
        year: Admission year.
        province_trg: Id of the province the students come from.
        typ: Subject category code (science / liberal arts / physics /
            history ...).
        batch: Admission batch code.
        file_path: CSV path handed to ``write_csv``.
    """
    all_rows = []
    for province_code in pro_id:
        all_rows.extend(
            get_test(year, int(province_code), province_trg, typ, batch, '')
        )
    # Write once at the end: write_csv regenerates the workbook from the
    # CSV it just wrote and then deletes that CSV, so calling it once per
    # province kept only the last province's rows.
    write_csv(file_path, all_rows)
def get_test_2(year:int, school_id:int, typ:int, batch:int, file_path):
    """Placeholder for a single-school export; currently does nothing.

    TODO: the commented-out draft this replaces fetched grades with
    ``get_year_grade``, reshaped them with ``return_all_grade`` and wrote
    them with ``write_csv``, but it passed no province id and referred to an
    undefined school record, so it was never runnable.

    Returns:
        None.
    """
    return None
if __name__ == '__main__':
    # Batch export: one workbook per (school province, student province,
    # category, batch) combination, all for admission year 2021.
    #
    # Category codes used below: 1 science, 2 liberal arts, 2073 physics
    # track, 2074 history track (see typ_dict).
    # Batch codes used below, per the original notes: 7 first batch,
    # 8 second batch, 10 junior college, 14 undergraduate, 44 second batch
    # section A.
    output_dir = './20220627'
    # Make sure the target directory exists before anything is written.
    os.makedirs(output_dir, exist_ok=True)

    # (province_src, province_trg, typ, batch, output file name)
    jobs = [
        (45, 45, 2, 7, '2021年广西本科一批次专业最低分数线.csv'),
        (45, 45, 2, 8, '2021年广西本科二批次专业最低分数线.csv'),
        (44, 43, 2073, 14, '2021年广东(生源地:湖南)物理类本科批次专业最低分数线.csv'),
        (45, 43, 2073, 14, '2021年广西(生源地:湖南)物理类本科批次专业最低分数线.csv'),
        (36, 43, 2073, 14, '2021年江西(生源地:湖南)物理类本科批次专业最低分数线.csv'),
        (44, 43, 2074, 14, '2021年广东(生源地:湖南)历史类本科批次专业最低分数线.csv'),
        (45, 43, 2074, 14, '2021年广西(生源地:湖南)历史类本科批次专业最低分数线.csv'),
        (36, 43, 2074, 14, '2021年江西(生源地:湖南)历史类本科批次专业最低分数线.csv'),
        (14, 14, 1, 8, '2021年山西理科本科二批次专业最低分数线.csv'),
        (45, 44, 2074, 14, '2021年广西(生源地:广东)历史类本科批次专业最低分数线.csv'),
        (36, 44, 2074, 14, '2021年江西(生源地:广东)历史类本科批次专业最低分数线.csv'),
        (43, 44, 2074, 14, '2021年湖南(生源地:广东)历史类本科批次专业最低分数线.csv'),
        (65, 43, 2073, 14, '2021年新疆(生源地:湖南)物理类本科批次专业最低分数线.csv'),
        (46, 43, 2073, 14, '2021年海南(生源地:湖南)物理类本科批次专业最低分数线.csv'),
        (35, 43, 2073, 14, '2021年福建(生源地:湖南)物理类本科批次专业最低分数线.csv'),
        (34, 43, 2073, 14, '2021年安徽(生源地:湖南)物理类本科批次专业最低分数线.csv'),
        (61, 43, 2073, 14, '2021年陕西(生源地:湖南)物理类本科批次专业最低分数线.csv'),
        (65, 65, 1, 7, '2021年新疆理科本科一批次专业最低分数线.csv'),
        # Fixed file name: this job queries category 1 (science) but used
        # the liberal-arts name, so its workbook was overwritten by the
        # category 2 / batch 8 job two entries below.
        (65, 65, 1, 8, '2021年新疆理科本科二批次专业最低分数线.csv'),
        (65, 65, 2, 7, '2021年新疆文科本科一批次专业最低分数线.csv'),
        (65, 65, 2, 8, '2021年新疆文科本科二批次专业最低分数线.csv'),
        (53, 53, 1, 7, '2021年云南理科本科一批次专业最低分数线.csv'),
        (53, 53, 1, 8, '2021年云南理科本科二批次专业最低分数线.csv'),
        (45, 45, 1, 10, '2021年广西理科专科批次专业最低分数线.csv'),
        (22, 22, 1, 44, '2021年吉林理科本科二A段批次专业最低分数线.csv'),
        # NOTE(review): "理文科" in the next file name looks like a typo for
        # "文科" (category 2); left unchanged so existing outputs keep their
        # name.
        (22, 22, 2, 44, '2021年吉林理文科本科二A段批次专业最低分数线.csv'),
        (15, 15, 2, 8, '2021年内蒙古文科本科二批次专业最低分数线.csv'),
    ]
    for province_src, province_trg, typ, batch, file_name in jobs:
        get_test(2021, province_src, province_trg, typ, batch,
                 output_dir + '/' + file_name)

