代码爬虫

Qs
渠帅 4 weeks ago
parent 15d0147c59
commit 9020d82ea5

@ -0,0 +1,123 @@
import requests
import csv, os
from pprint import pprint
# Request headers shared by every HTTP call in this script. The values
# imitate a desktop Chrome 129 browser on Windows issuing a same-site CORS
# request from https://www.kaoyan.cn.
headers = {
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'content-type': 'application/json;charset=UTF-8',
    'origin': 'https://www.kaoyan.cn',
    'priority': 'u=1, i',
    'referer': 'https://www.kaoyan.cn/',
    # Client-hint headers (single-quoted so the embedded double quotes
    # need no escaping).
    'sec-ch-ua': '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    # Fetch-metadata headers.
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/129.0.0.0 Safari/537.36'
    ),
}
def file_write(file_path, data_title, file_data):
    """Append one row to a CSV file, writing the header row first if needed.

    Args:
        file_path: Path of the CSV file; it is created when missing.
        data_title: Sequence of column names, written only when the file
            is missing or empty.
        file_data: Sequence of values for the row to append.
    """
    try:
        # An existing but zero-length file still needs its header.
        needs_header = os.path.getsize(file_path) == 0
    except OSError:
        # getsize() fails when the file does not exist (or cannot be
        # stat'ed); treat that as "empty". Only OSError is caught so that
        # KeyboardInterrupt and programming errors are not swallowed.
        needs_header = True

    with open(file_path, 'a', encoding='UTF-8', newline='') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(data_title)
        writer.writerow(file_data)
    # Report success only after the file has been flushed and closed.
    print('写入成功,本次下载已完成。')
def get_html(year, school, page):
    """Fetch one page of a school's score lines and save the matching rows.

    Downloads ``https://static.kaoyan.cn/json/score/{year}/{school_id}/0/{page}.json``,
    retrying until the server answers 200 or 404, and appends every item
    whose major code is '08', '1812' or '081200' to '复试分数线.csv'.

    Args:
        year: Year to query; interpolated into the URL and written to the row.
        school: Mapping with the keys 'school_name', 'school_type_name',
            'school_id' and 'province_name'.
        page: Page number interpolated into the URL.

    Returns:
        0 when the server answers 404 for this page; otherwise
        ``count // 10 + 1``, where ``count`` comes from the response.
    """
    import time  # local import: only needed for the pause between retries

    data_title = ['年份', '学校名称', '学校类型', '省份', '硕士类型', '专业代码', '专业名称', '总分', '政治', '英语',
                  '专业课一', '专业课二', '备注']
    # Major codes whose rows are kept; all other items are skipped.
    wanted_codes = ('08', '1812', '081200')

    school_name = school['school_name']
    school_type_name = school['school_type_name']
    school_id = school['school_id']  # ID used in the query URL
    province_name = school['province_name']
    url = f"https://static.kaoyan.cn/json/score/{year}/{school_id}/0/{page}.json"

    while True:
        try:
            # The timeout keeps a stalled connection from hanging the crawl.
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException:
            # Only network/HTTP-layer failures are retried, so Ctrl-C and
            # genuine programming errors still propagate.
            print('请求出错,重试中,请稍等。。。')
        else:
            print(url, school_name, f'正在查询第{page}', response)
            if response.status_code == 200:
                break
            if response.status_code == 404:
                return 0
        # Pause briefly so repeated failures do not hammer the server.
        time.sleep(1)

    res_data = response.json()['data']
    # Assumes 10 items per page (TODO confirm). When count is an exact
    # multiple of 10 this is one page too many; the caller then requests a
    # page that presumably does not exist and stops on the 0 returned above.
    page_max = res_data['count'] // 10 + 1

    for item in res_data['item']:
        special_code = item['special_code']  # major code
        if special_code not in wanted_codes:
            continue
        # Column order must match data_title.
        file_data = [
            year, school_name, school_type_name, province_name,
            item['degree_type_name'],  # master's degree type
            special_code,
            item['special_name'],      # major name
            item['total'],             # total score
            item['politics'],
            item['english'],
            item['special_one'],       # specialised course 1
            item['special_two'],       # specialised course 2
            item['note'],
        ]
        file_write('复试分数线.csv', data_title, file_data)
    return page_max
def get_degree_type_list():
    """Request the school list from the kaoyan.cn API.

    Despite the function's name, the value returned is the list of school
    records found at ``['data']['data']`` in the JSON response. Only the
    first page (up to 500 entries) is requested.

    Returns:
        The list stored under ``data.data`` in the response body.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = "https://api.kaoyan.cn/pc/special/schoolListV2"
    payload = {
        "page": 1,
        "limit": 500,
        "province_id": "",
        "type": "",
        "feature": "",
        # NOTE(review): presumably the site's ID for the target major —
        # confirm against the website.
        "spe_id": "20364",
        "recruit_type": "",
        "sort_type": "",
        "english_subject": "",
        "math_subject": "",
    }
    # The timeout prevents the whole crawl from hanging on a dead connection.
    response = requests.post(url, headers=headers, json=payload, timeout=30)
    print(response)
    # Fail with a clear HTTP error instead of an opaque JSON/KeyError below.
    response.raise_for_status()
    return response.json()['data']['data']
def star():
    """Crawl the score lines of every listed school for 2022-2024.

    For each school returned by get_degree_type_list() and each year,
    pages are requested through get_html() starting at page 1. Paging
    stops once the page number exceeds the page count get_html() last
    returned; a return value of 0 therefore ends paging immediately.
    """
    years = ('2022', '2023', '2024')
    for school in get_degree_type_list():
        for year in years:
            page = 1
            page_max = 1
            # The former `page != 0` test was always true (page starts at 1
            # and only grows), so the loop is governed by page_max alone.
            while page <= page_max:
                page_max = get_html(year, school, page)
                page += 1
# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    star()

@ -0,0 +1,149 @@
import requests
import csv, os
from pprint import pprint
# Request headers shared by every HTTP call in this script. The values
# imitate a desktop Chrome 129 browser on Windows issuing a same-site CORS
# request from https://www.kaoyan.cn.
headers = {
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'content-type': 'application/json;charset=UTF-8',
    'origin': 'https://www.kaoyan.cn',
    'priority': 'u=1, i',
    'referer': 'https://www.kaoyan.cn/',
    # Client-hint headers (single-quoted so the embedded double quotes
    # need no escaping).
    'sec-ch-ua': '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    # Fetch-metadata headers.
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/129.0.0.0 Safari/537.36'
    ),
}
def file_write(file_path, data_title, file_data):
    """Append one row to a CSV file, writing the header row first if needed.

    Args:
        file_path: Path of the CSV file; it is created when missing.
        data_title: Sequence of column names, written only when the file
            is missing or empty.
        file_data: Sequence of values for the row to append.
    """
    try:
        # An existing but zero-length file still needs its header.
        needs_header = os.path.getsize(file_path) == 0
    except OSError:
        # getsize() fails when the file does not exist (or cannot be
        # stat'ed); treat that as "empty". Only OSError is caught so that
        # KeyboardInterrupt and programming errors are not swallowed.
        needs_header = True

    with open(file_path, 'a', encoding='UTF-8', newline='') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(data_title)
        writer.writerow(file_data)
    # Report success only after the file has been flushed and closed.
    print('写入成功,本次下载已完成。')
def get_html(school_id, year):
    """Fetch a school's enrollment-plan list for one year.

    POSTs to the planList endpoint with the keyword "081200" and retries
    until the server answers with HTTP 200.

    Args:
        school_id: School ID sent as "school_id" in the request body.
        year: Year sent as "year" in the request body.

    Returns:
        The list stored under ``data.data`` in the JSON response.
    """
    import time  # local import: only needed for the pause between retries

    url = "https://api.kaoyan.cn/pc/school/planList"
    payload = {
        "school_id": school_id,
        "page": 1,
        # A single large page is requested instead of paginating.
        "limit": 8000,
        "recruit_type": "",
        "year": year,
        "keyword": "081200",
        "is_apply": 2,
    }
    while True:
        try:
            # The timeout keeps a stalled connection from hanging the crawl.
            response = requests.post(url, headers=headers, json=payload, timeout=30)
        except requests.RequestException:
            # Only network/HTTP-layer failures are retried, so Ctrl-C and
            # genuine programming errors still propagate.
            print('请求出错,重试中。。。')
        else:
            print(school_id, year, response)
            if response.status_code == 200:
                break
        # Pause briefly so repeated failures do not hammer the server.
        time.sleep(1)
    return response.json()['data']['data']
def get_degree_type_list():
    """Request the school list from the kaoyan.cn API.

    Despite the function's name, the value returned is the list of school
    records found at ``['data']['data']`` in the JSON response. Only the
    first page (up to 500 entries) is requested.

    Returns:
        The list stored under ``data.data`` in the response body.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = "https://api.kaoyan.cn/pc/special/schoolListV2"
    payload = {
        "page": 1,
        "limit": 500,
        "province_id": "",
        "type": "",
        "feature": "",
        # NOTE(review): presumably the site's ID for the target major —
        # confirm against the website.
        "spe_id": "20364",
        "recruit_type": "",
        "sort_type": "",
        "english_subject": "",
        "math_subject": "",
    }
    # The timeout prevents the whole crawl from hanging on a dead connection.
    response = requests.post(url, headers=headers, json=payload, timeout=30)
    print(response)
    # Fail with a clear HTTP error instead of an opaque JSON/KeyError below.
    response.raise_for_status()
    return response.json()['data']['data']
def get_success(plan_id):
    """Fetch the detail record of one enrollment plan as a flat row.

    POSTs to the planDetail endpoint and retries until the server answers
    with HTTP 200.

    Args:
        plan_id: Plan ID sent as "plan_id" in the request body.

    Returns:
        A list of 15 values taken from the response's ``data`` object, in
        the order given by ``fields`` below.

    Raises:
        KeyError: If the response's ``data`` object lacks one of the fields.
    """
    import time  # local import: only needed for the pause between retries

    # Response keys in output-column order.
    fields = (
        'degree_type_name',   # degree category
        'year',               # enrollment year
        'level1_code',        # discipline category code
        'level1_name',        # discipline category
        'special_code',       # major code
        'special_name',       # major
        'depart_name',        # department / college
        'recruit_type_name',  # study mode
        'level2_code',        # discipline code
        'level2_name',        # discipline
        'recruit_number',     # planned enrollment
        'research_area',      # research direction
        'exam_subject',       # preliminary exam subjects
        'exam_book',          # reference books
        'note',               # remarks
    )
    url = "https://api.kaoyan.cn/pc/school/planDetail"
    payload = {
        "plan_id": plan_id,
        "is_apply": 2,
    }
    while True:
        try:
            # The timeout keeps a stalled connection from hanging the crawl.
            response = requests.post(url, headers=headers, json=payload, timeout=30)
        except requests.RequestException:
            # Only network/HTTP-layer failures are retried, so Ctrl-C and
            # genuine programming errors still propagate.
            print('请求出错,重试中。。。')
        else:
            print(plan_id, response)
            if response.status_code == 200:
                break
        # Pause briefly so repeated failures do not hammer the server.
        time.sleep(1)

    data_dict = response.json()['data']
    return [data_dict[name] for name in fields]
def star():
    """Crawl the enrollment plans of every listed school into '招生专业.csv'.

    For each school from get_degree_type_list() and each year 2022-2024,
    the plan list is fetched with get_html(), each plan's detail row with
    get_success(), and the combined row is appended through file_write().
    """
    data_title = [
        '学校名称', '学校类型', '省份', '学位类别', '招生年份', '所属门类代码',
        '所属门类', '专业代码', '专业', '所属学院', '学习方式', '所属学科代码',
        '所属学科', '拟招生人数', '研究方向', '初试科目', '参考书目', '备注',
    ]
    years = ['2022', '2023', '2024']

    for school in get_degree_type_list():
        name = school['school_name']
        kind = school['school_type_name']
        school_id = school['school_id']  # ID used for the plan queries
        province = school['province_name']
        # The first three columns are the same for every row of this school.
        school_columns = [name, kind, province]

        for year in years:
            for plan in get_html(school_id, year):
                detail = get_success(plan['plan_id'])
                file_write('招生专业.csv', data_title, school_columns + detail)
# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    star()
Loading…
Cancel
Save