[Python] 高考报志愿愁死人,爬点数据助送大学
最近高考出分,填报志愿愁死人,抓取一些数据,帮忙好好分析分析去哪个大学最合适。
# -*- coding: utf-8 -*-
"""Scrape college data from the eol.cn / gaokao.cn gaokao query APIs.

The script asks which data set to fetch (1 = basic school info, 2 = provincial
score lines, 3 = enrollment plan, 4 = per-major score lines), walks the paged
school list and appends every record to ``<name>.json`` and ``<name>.csv`` in
the current directory.

The fetch functions read a few module-level globals that the main loop sets
for the school currently being processed: ``SCHOOL``, ``school_name``,
``E_batch`` and, for options 2-4, ``match_yearS`` / ``subjectS`` /
``provinceS``.
"""
import csv
import json as j
import random
import time

import requests
from fake_useragent import UserAgent

ua = UserAgent()
us = ua.random
url = "https://api.eol.cn/gkcx/api/?"
headers = {
    "Host": "api.eol.cn",
    "Referer": "https://gkcx.eol.cn/school/search",
    "User-Agent": us,
}

# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 15

# The original author's note says the school list has 143 pages in total.
TOTAL_PAGES = 143

# Menu option -> base name of the output files (``<name>.json`` / ``<name>.csv``).
OUTPUT_NAMES = {
    1: "school_jiben",
    2: "mat",
    3: "Enrollment_plan",
    4: "Professional_score_line",
}

# Fixed browser User-Agent used by the plan / major-score requests.
_QQ_BROWSER_UA = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.400"
)


def request_school(page):
    """POST the school-list query for one page and return the decoded JSON.

    Args:
        page: 1-based page number of the school list.

    Returns:
        The decoded JSON body, or ``None`` when the request fails, the status
        code is not 200, or the body is not valid JSON.
    """
    reque_pay = {
        "access_token": "",
        "admissions": "",
        "central": "",
        "department": "",
        "dual_class": "",
        "f211": "",
        "f985": "",
        "is_doublehigh": "",
        "is_dual_class": "",
        "keyword": "",
        "nature": "",
        "page": f"{page}",
        "province_id": "",
        "ranktype": "",
        "request_type": 1,
        "school_type": "",
        "size": 20,
        "sort": "view_total",
        "top_school_id": "[766]",
        "type": "",
        "uri": "apidata/api/gk/school/lists",
    }
    try:
        open_url = requests.post(
            url,
            data=j.dumps(reque_pay),
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
    except requests.RequestException as e:
        print("error", e.args)
        return None
    if open_url.status_code != 200:
        return None
    try:
        return open_url.json()
    except ValueError:
        # Body was not JSON (e.g. an HTML error page).
        return None


def news(json):
    """Yield one basic-information dict per school in a list-page response.

    Sleeps a random 3-10 seconds first to throttle the crawl.

    Args:
        json: Decoded response from ``request_school`` (may be ``None``).

    Yields:
        Dicts keyed by the Chinese column names written to the output files.
    """
    time.sleep(random.randint(3, 10))
    if not json:
        return
    data = json.get("data") if isinstance(json, dict) else None
    items_new = data.get("item") if isinstance(data, dict) else None
    print(items_new)
    for school in items_new or []:
        # Either name may be missing; treat it as empty instead of crashing
        # on ``None + "|"``.
        dual_class = school.get("dual_class_name") or ""
        nature = school.get("nature_name") or ""
        yield {
            "学校id": school.get("school_id"),
            "名字": school.get("name"),
            "类型": school.get("type_name"),
            "科类": school.get("level_name"),
            "级别": dual_class + "|" + nature,
            "位置": school.get("address"),
            "招生咨询网站": school.get("answerurl"),
        }


def _fetch_items(data_url, request_headers):
    """GET a static-data JSON document and return its ``data.item`` list.

    Returns ``None`` when the request fails, the status is not 200, the body
    is not JSON, or the expected ``data`` / ``item`` structure is absent.
    """
    try:
        response = requests.get(
            data_url, headers=request_headers, timeout=REQUEST_TIMEOUT
        )
    except requests.RequestException as err:
        print("errp", err.args)
        return None
    if response.status_code != 200:
        return None
    try:
        payload = response.json()
    except ValueError:
        return None
    data = payload.get("data") if isinstance(payload, dict) else None
    items = data.get("item") if isinstance(data, dict) else None
    return items if isinstance(items, list) else None


def math(lst, match_year, subject, province):
    """Yield the provincial score-line records of one school.

    Note: the name shadows the stdlib ``math`` module; it is kept for
    backward compatibility.

    Args:
        lst: School id.
        match_year: Admission year.
        subject: Subject code (per the original notes: 1 = science,
            2 = liberal arts).
        province: Province id.

    Yields:
        One dict per score-line entry. Nothing is yielded (and a notice is
        printed) when the data is unavailable.
    """
    math_headers = {
        "Host": "static-data.eol.cn",
        "Origin": "https://gkcx.eol.cn",
        "Referer": f"https://gkcx.eol.cn/school/{lst}/provinceline",
        "User-Agent": us,
    }
    math_url = (
        "https://static-data.eol.cn/www/2.0/schoolprovinceindex/"
        f"{match_year}/{lst}/{province}/{subject}/1.json"
    )
    math_get = _fetch_items(math_url, math_headers)
    if math_get is None:
        print(school_name + ":" + "暂时还没有其内容")
        return
    for a_match in math_get:
        math_data = {
            "学校名字": school_name,
            "年份": a_match.get("year"),
            "录取批次": a_match.get("local_batch_name"),
            "招生类型": a_match.get("zslx_name"),
            # f-string so non-string (int / None) values do not raise.
            "最低分/最低位次": f'{a_match.get("min")}/{a_match.get("min_section")}',
            "省控线": a_match.get("proscore"),
        }
        print("正在获取")
        yield math_data


def Enrollment_plan(lst, match_year, subject, province, E_batch):
    """Yield the enrollment-plan records of one school.

    Args:
        lst: School id.
        match_year: Admission year.
        subject: Subject code.
        province: Province id.
        E_batch: Admission batch code.

    Yields:
        One dict per planned major. Nothing is yielded (a notice and the
        requested URL are printed) when the data is unavailable.
    """
    math_headers = {
        "Host": "static-data.gaokao.cn",
        "Origin": "https://www.gaokao.cn",
        "Referer": f"https://www.gaokao.cn/school/{lst}/provinceline",
        "User-Agent": _QQ_BROWSER_UA,
    }
    E_url = (
        "https://static-data.gaokao.cn/www/2.0/schoolplanindex/"
        f"{match_year}/{lst}/{province}/{subject}/{E_batch}/1.json"
    )
    E_get = _fetch_items(E_url, math_headers)
    if E_get is None:
        print(school_name + ":" + "暂时还没有其内容")
        print(E_url)
        return
    for E_match in E_get:
        yield {
            "学校名字": school_name,
            "专业名称": E_match.get("spname"),
            "学科门类": E_match.get("level2_name"),
            "计划招生": E_match.get("num"),
            "学制": E_match.get("length"),
        }


def Professional_score_line(lst, match_year, subject, province, E_batch):
    """Yield the per-major score-line records of one school.

    Batch codes per the original author's notes: 7 = first-batch
    undergraduate, 6 = early-admission undergraduate batch, 10 = junior
    college batch.

    Args:
        lst: School id.
        match_year: Admission year.
        subject: Subject code.
        province: Province id.
        E_batch: Admission batch code.

    Yields:
        One dict per major. Nothing is yielded (and a notice is printed) when
        the data is unavailable.
    """
    math_headers = {
        "Host": "static-data.eol.cn",
        "Origin": "https://gkcx.eol.cn",
        "Referer": f"https://gkcx.eol.cn/school/{lst}/provinceline",
        "User-Agent": _QQ_BROWSER_UA,
    }
    # NOTE(review): this requests the same ``schoolplanindex`` resource as
    # Enrollment_plan while reading score fields ("average", "zslx_name");
    # the path may have been meant to be ``schoolspecialindex`` - confirm
    # against the live API before changing it.
    E_url = (
        "https://static-data.eol.cn/www/2.0/schoolplanindex/"
        f"{match_year}/{lst}/{province}/{subject}/{E_batch}/1.json"
    )
    R_get = _fetch_items(E_url, math_headers)
    if R_get is None:
        print(school_name + ":" + "暂时还没有其内容")
        return
    for R_match in R_get:
        yield {
            "学校名字": school_name,
            "专业名称": R_match.get("spname"),
            "录取批次": R_match.get("level1_name"),
            "招生类型": R_match.get("zslx_name"),
            "平均分": R_match.get("average"),
        }


def _records_for(choose, data_id):
    """Return an iterable of every record for menu option ``choose``.

    Reads the globals ``SCHOOL`` and ``E_batch`` and, for options 2-4, the
    user-supplied ``match_yearS`` / ``subjectS`` / ``provinceS``. Unknown
    options give an empty iterable.
    """
    if choose == 1:
        return [data_id]
    lst = SCHOOL
    if choose == 2:
        return math(lst, match_yearS, subjectS, provinceS)
    if choose == 3:
        return Enrollment_plan(lst, match_yearS, subjectS, provinceS, E_batch)
    if choose == 4:
        return Professional_score_line(
            lst, match_yearS, subjectS, provinceS, E_batch
        )
    return ()


# Unit 2
def manu(choose, data_id):
    """Return the first record for menu option ``choose``.

    Args:
        choose: Menu option (1-4).
        data_id: Basic-info dict of the current school; returned unchanged
            for option 1.

    Returns:
        The first record produced for the option, or ``None`` when there is
        none. Only the first record is returned (as before); the main loop
        iterates ``_records_for`` directly so that no records are dropped.
    """
    return next(iter(_records_for(choose, data_id)), None)


def save_josn(result, choose):
    """Append ``result`` as an indented JSON object to the option's file.

    The output file is a sequence of JSON objects, one appended per call and
    separated by a newline. Empty results are skipped; an unknown ``choose``
    prints an error and writes nothing.
    """
    name_xls = OUTPUT_NAMES.get(choose)
    if name_xls is None:
        print("出错了")
        return
    if not result:
        return
    with open(f"{name_xls}.json", mode="a+", encoding="utf-8") as open_file:
        j.dump(result, open_file, ensure_ascii=False, indent=4)
        open_file.write("\n")


# Spreadsheet export (marked unfinished by the original author): rows are
# appended to a CSV file without a header row.
def save_data(choose, data):
    """Append the values of ``data`` as one row to the option's CSV file.

    Empty data is skipped; an unknown ``choose`` prints an error and writes
    nothing.
    """
    name_xls = OUTPUT_NAMES.get(choose)
    if name_xls is None:
        print("出错了")
        return
    if data:
        with open(f"{name_xls}.csv", mode="a", newline="", encoding="utf8") as cfa:
            csv.writer(cfa).writerow(data.values())


if __name__ == "__main__":
    print("城市参数在ID里面")
    print("获取基本信息输入: 1")
    print("获取省内分数线输: 2")
    print("获取计划招生输入: 3")
    print("获取专业线分数输:4")
    try:
        choose = int(input("请输入查询数据:"))
        if choose not in OUTPUT_NAMES:
            raise ValueError(choose)
        if choose != 1:
            # Convert to int so bad input is rejected here rather than being
            # formatted into request URLs.
            match_yearS = int(input("输入查询年份:"))  # admission year
            subjectS = int(input("选择学科:"))  # 1 = science, 2 = liberal arts
            provinceS = int(input("输入查询省份ID:"))  # see the id.txt lookup file
    except ValueError:
        print("\033[0:31m 不要乱输入哦\033[0m")
        raise SystemExit(1) from None
    print("\033[0:32m 获取成功,爬虫正在爬取,等待一会查看目录下文件即可\033[0m")

    # Lower TOTAL_PAGES to fetch fewer pages. The loop is inclusive of the
    # last page (the original stopped one page short).
    for page in range(1, TOTAL_PAGES + 1):
        page_json = request_school(page)
        for data_id in news(page_json):
            SCHOOL = data_id.get("学校id")
            school_name = data_id.get("名字") or ""
            # Batch codes per the original notes: 7 = first-tier
            # undergraduate, 8 = second-tier, 10 = junior college.
            E_batch = 7 if "普通本科" in data_id.values() else 10
            for data in _records_for(choose, data_id):
                save_josn(data, choose)
                save_data(choose, data)
[Python] 高考历年分数最低查询
# -*- coding: utf-8 -*-
"""Query per-major minimum admission scores from the gaokao.cn / eol.cn APIs.

For every school located in a source province, the per-major admission
records for students from a target province are downloaded and exported to
an Excel workbook.
"""
import csv
import json
import os

import openpyxl  # noqa: F401  (Excel writer backend for pandas ``to_excel``)
import pandas as pd
import requests

# Province id -> province name.
pro_id = {
    '11': '北京市',
    '12': '天津市',
    '13': '河北省',
    '14': '山西省',
    '15': '内蒙古自治区',
    '21': '辽宁省',
    '22': '吉林省',
    '23': '黑龙江省',
    '31': '上海市',
    '32': '江苏省',
    '33': '浙江省',
    '34': '安徽省',
    '35': '福建省',
    '36': '江西省',
    '37': '山东省',
    '41': '河南省',
    '42': '湖北省',
    '43': '湖南省',
    '44': '广东省',
    '45': '广西壮族自治区',
    '46': '海南省',
    '50': '重庆市',
    '51': '四川省',
    '52': '贵州省',
    '53': '云南省',
    '54': '西藏自治区',
    '61': '陕西省',
    '62': '甘肃省',
    '63': '青海省',
    '64': '宁夏回族自治区',
    '65': '新疆维吾尔自治区',
    '71': '台湾省',
    '81': '香港特别行政区',
    '82': '澳门特别行政区',
}

# Subject-category code -> name.
typ_dict = {
    '2073': '物理类',
    '2074': '历史类',
    '4': '艺术类',
    '1': '理科',
    '2': '文科',
    '3': '综合',
}

# Admission-batch code -> name. The last four entries were only listed in a
# commented-out copy of this table in the original script.
batch_dict = {
    '7': '一本(广西、陕西)',
    '8': '二本(广西、陕西)',
    '14': '本科(广东、湖南)',
    '1570': '普通类一段(山东)',
    '1571': '普通类二段(山东)',
    '10': '专科',
    '44': '本科二批A段',
    '45': '本科二批B段',
    '51': '本科一批A段',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5'
}

# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 15


def _get_json(url):
    """GET ``url`` and return the decoded JSON body, or ``None`` on failure."""
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print('请求失败:', url, err)
        return None
    try:
        return json.loads(response.text)
    except ValueError:
        # Body was not JSON (e.g. an HTML error page).
        return None


def _items_of(payload):
    """Return ``payload['data']['item']`` as a list, or ``[]`` if unavailable.

    A payload that is not a dict containing ``code`` (for example the bare
    ``""`` the static server returns past the last page) gives ``[]``.
    """
    if not isinstance(payload, dict) or 'code' not in payload:
        return []
    data = payload.get('data')
    items = data.get('item') if isinstance(data, dict) else None
    return items if isinstance(items, list) else []


# Look up the id of a single school.
def get_school_id(schuool_name):
    """Return the id of the school whose name equals ``schuool_name``.

    Args:
        schuool_name: Exact school name used as the search keyword.

    Returns:
        The matching ``school_id`` (the last match if there are several), or
        ``-1`` when the request fails or nothing matches.
    """
    school_id = -1
    reque = 'https://api.eol.cn/gkcx/api/?keyword=%s&uri=apigkcx/api/school/hotlists' % (schuool_name)
    re_dict = _get_json(reque)
    if not isinstance(re_dict, dict) or re_dict.get('code') != '0000':
        return school_id
    for entry in _items_of(re_dict):
        if entry.get('name') == schuool_name:
            school_id = entry.get('school_id', school_id)
    return school_id


# Fetch the per-major admission records of one school for one year.
def get_year_grade(year:int, school_id:int, province_id:int, typ:int ,batch:int):
    """Return every per-major admission record for the given query.

    Example resource:
    https://static-data.gaokao.cn/www/2.0/schoolspecialindex/2021/3593/45/2/10/1.json

    Args:
        year: Admission year.
        school_id: School id.
        province_id: Id of the province the students come from.
        typ: Subject-category code (see ``typ_dict``).
        batch: Admission-batch code (see ``batch_dict``).

    Returns:
        List of raw item dicts collected over consecutive result pages.
        Paging stops at the first page that yields no items, so a failed
        request ends the loop instead of raising.
    """
    item_list = []
    for num in range(1, 999):
        url = 'https://static-data.gaokao.cn/www/2.0/schoolspecialindex/%d/%d/%d/%d/%d/%d.json' % (year, school_id, province_id, typ, batch, num)
        items = _items_of(_get_json(url))
        if not items:
            break
        item_list.extend(items)
    return item_list


# Fetch every school (including its id) located in one province.
def get_province_all(province_id:int):
    """Return the raw school records of all schools in ``province_id``.

    Each record is a dict as returned by the API; the keys used elsewhere in
    this script are ``name`` and ``school_id``. Example resource:
    https://api.eol.cn/web/api/?&page=8&province_id=45&uri=apidata/api/gk/school/lists

    Paging stops at the first page that yields no items.
    """
    item_list = []
    for page in range(1, 99):
        url = 'https://api.eol.cn/web/api/?&page=%d&province_id=%d&uri=apidata/api/gk/school/lists' % (page, province_id)
        items = _items_of(_get_json(url))
        if not items:
            break
        item_list.extend(items)
    return item_list


# Reduce raw school records to name + id.
def return_all_school_id(school_info:list):
    """Return ``[{'name': ..., 'school_id': ...}, ...]`` for the raw records.

    Raises:
        KeyError: If a record lacks ``name`` or ``school_id``.
    """
    return [
        {'name': school['name'], 'school_id': school['school_id']}
        for school in school_info
    ]


# Extract the score fields from raw admission records.
def return_all_grade(grade_info:list):
    """Return the score-related fields of each raw admission record.

    Raises:
        KeyError: If a record lacks one of the extracted fields.
    """
    return [
        {
            'local_batch_name': record['local_batch_name'],
            'spname': record['spname'],
            'min': record['min'],
            'min_section': record['min_section'],
            'type': record['type'],
            'province': record['province'],  # province the students come from
        }
        for record in grade_info
    ]


# Write rows to CSV, then convert the CSV to an Excel workbook.
def write_csv(path:str,date:list):
    """Append ``date`` to the CSV at ``path`` and convert it to ``.xlsx``.

    The rows are appended to ``path`` (a header is written when the file is
    new), the whole CSV is then exported to an ``.xlsx`` file with the same
    base name (sheet ``data``), and the CSV is deleted. Because the CSV is
    removed afterwards, each call normally replaces the workbook with just
    the rows passed in.

    Args:
        path: Path of the intermediate CSV file.
        date: Non-empty list of dicts sharing the keys of the first dict.
            An empty list prints a notice and writes nothing.
    """
    if not date:
        print('抱歉,啥都没有')
        return
    directory = os.path.dirname(path)
    if directory:
        # Create the output folder up front so a long crawl is not lost to a
        # missing directory.
        os.makedirs(directory, exist_ok=True)
    file_ex = os.path.exists(path)
    with open(path, 'a+', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(date[0]))
        if not file_ex:
            writer.writeheader()
        writer.writerows(date)
    # Convert only after the CSV is closed so all rows are flushed to disk.
    csv_excel = pd.read_csv(path, encoding='utf-8')
    csv_excel.to_excel(os.path.splitext(path)[0] + '.xlsx', sheet_name='data')
    os.remove(path)


# Minimum scores of all schools in one province for students of one province.
def get_test(year:int, province_src:int, province_trg:int, typ:int, batch:int, file_path:str):
    """Collect per-major minimum scores for all schools in ``province_src``.

    Args:
        year: Admission year.
        province_src: Id of the province whose schools are queried.
        province_trg: Id of the province the students come from.
        typ: Subject-category code (see ``typ_dict``).
        batch: Admission-batch code (see ``batch_dict``).
        file_path: CSV path handed to ``write_csv``; pass ``''`` to get the
            rows back instead of writing them.

    Returns:
        The list of row dicts when ``file_path`` is ``''``, otherwise
        ``None``.
    """
    school_info = return_all_school_id(get_province_all(province_src))
    test_list = []
    for num, school in enumerate(school_info, start=1):
        print(num)  # progress counter
        gra = return_all_grade(
            get_year_grade(
                year=year,
                school_id=school['school_id'],
                province_id=province_trg,
                typ=typ,
                batch=batch,
            )
        )
        for g in gra:
            province_key = str(g['province'])
            type_key = str(g['type'])
            test_list.append(
                {
                    '学校': school['name'],
                    '年份': year,
                    '批次': g['local_batch_name'],
                    # Fall back to the raw code when it is missing from the
                    # lookup tables.
                    '生源地': pro_id.get(province_key, province_key),
                    '类别': typ_dict.get(type_key, type_key),
                    '专业': g['spname'],
                    '最低分': g['min'],
                    '最低排名': g['min_section'],
                }
            )
    if file_path == '':
        return test_list
    write_csv(file_path, test_list)


def get_max_mean_min(grade_info:list):
    """Print batch, major, min / max / average score and min rank per record.

    Raises:
        KeyError: If a record lacks one of the printed fields.
    """
    item_list = [
        {
            '批次': record['local_batch_name'],
            '专业': record['spname'],
            '最低分': record['min'],
            '最高分': record['max'],
            '平均分': record['average'],
            '最低排名': record['min_section'],
        }
        for record in grade_info
    ]
    for row in item_list:
        print(row, '\n')


# Nationwide per-major minimum scores for students of one province.
def get_test_1(year:int, province_trg:int, typ:int, batch:int, file_path):
    """Collect ``get_test`` rows for every source province and export them.

    Args:
        year: Admission year.
        province_trg: Id of the province the students come from.
        typ: Subject-category code.
        batch: Admission-batch code.
        file_path: CSV path handed to ``write_csv``.

    All provinces are gathered first and written once, because each
    ``write_csv`` call regenerates the workbook and deletes the CSV.
    """
    rows = []
    for province_src in pro_id:
        rows.extend(get_test(year, int(province_src), province_trg, typ, batch, ''))
    write_csv(file_path, rows)


def get_test_2(year:int, school_id:int, typ:int, batch:int, file_path):
    """Placeholder for a single-school export; not implemented, does nothing."""
    # TODO: implement the single-school export.
    pass


# Folder that receives the generated workbooks.
OUTPUT_DIR = './20220627'

# (source province, target province, subject type, batch, output title)
EXPORT_JOBS = [
    (45, 45, 2, 7, '2021年广西本科一批次专业最低分数线'),
    (45, 45, 2, 8, '2021年广西本科二批次专业最低分数线'),
    (44, 43, 2073, 14, '2021年广东(生源地:湖南)物理类本科批次专业最低分数线'),
    (45, 43, 2073, 14, '2021年广西(生源地:湖南)物理类本科批次专业最低分数线'),
    (36, 43, 2073, 14, '2021年江西(生源地:湖南)物理类本科批次专业最低分数线'),
    (44, 43, 2074, 14, '2021年广东(生源地:湖南)历史类本科批次专业最低分数线'),
    (45, 43, 2074, 14, '2021年广西(生源地:湖南)历史类本科批次专业最低分数线'),
    (36, 43, 2074, 14, '2021年江西(生源地:湖南)历史类本科批次专业最低分数线'),
    (14, 14, 1, 8, '2021年山西理科本科二批次专业最低分数线'),
    (45, 44, 2074, 14, '2021年广西(生源地:广东)历史类本科批次专业最低分数线'),
    (36, 44, 2074, 14, '2021年江西(生源地:广东)历史类本科批次专业最低分数线'),
    (43, 44, 2074, 14, '2021年湖南(生源地:广东)历史类本科批次专业最低分数线'),
    (65, 43, 2073, 14, '2021年新疆(生源地:湖南)物理类本科批次专业最低分数线'),
    (46, 43, 2073, 14, '2021年海南(生源地:湖南)物理类本科批次专业最低分数线'),
    (35, 43, 2073, 14, '2021年福建(生源地:湖南)物理类本科批次专业最低分数线'),
    (34, 43, 2073, 14, '2021年安徽(生源地:湖南)物理类本科批次专业最低分数线'),
    (61, 43, 2073, 14, '2021年陕西(生源地:湖南)物理类本科批次专业最低分数线'),
    (65, 65, 1, 7, '2021年新疆理科本科一批次专业最低分数线'),
    # Was labelled 文科, which made this file collide with (and be
    # overwritten by) the 文科 batch-8 export below.
    (65, 65, 1, 8, '2021年新疆理科本科二批次专业最低分数线'),
    (65, 65, 2, 7, '2021年新疆文科本科一批次专业最低分数线'),
    (65, 65, 2, 8, '2021年新疆文科本科二批次专业最低分数线'),
    (53, 53, 1, 7, '2021年云南理科本科一批次专业最低分数线'),
    (53, 53, 1, 8, '2021年云南理科本科二批次专业最低分数线'),
    (45, 45, 1, 10, '2021年广西理科专科批次专业最低分数线'),
    (22, 22, 1, 44, '2021年吉林理科本科二A段批次专业最低分数线'),
    (22, 22, 2, 44, '2021年吉林文科本科二A段批次专业最低分数线'),
    (15, 15, 2, 8, '2021年内蒙古文科本科二批次专业最低分数线'),
]


def main():
    """Run every export job for admission year 2021."""
    for province_src, province_trg, typ, batch, title in EXPORT_JOBS:
        get_test(2021, province_src, province_trg, typ, batch,
                 f'{OUTPUT_DIR}/{title}.csv')


if __name__ == '__main__':
    main()