From 36ecd721ed78eabfc754efe92402c1ee7f1a846a Mon Sep 17 00:00:00 2001
From: p6sitlwaz <284768853@qq.com>
Date: Tue, 18 Jun 2024 16:51:35 +0800
Subject: [PATCH] Data collection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 期末zy.py | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 期末zy.py

diff --git a/期末zy.py b/期末zy.py
new file mode 100644
index 0000000..6ff9691
--- /dev/null
+++ b/期末zy.py
@@ -0,0 +1,104 @@
+from bs4 import BeautifulSoup
+import xlwt
+import requests
+
+def ask_url(url):
+    # Fetch a page and return its text, or "" on any request failure.
+    head = {
+        "User-Agent":
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"
+    }
+    try:
+        r = requests.get(url, headers=head, timeout=30)
+        r.raise_for_status()
+        r.encoding = 'utf-8'
+        return r.text
+    except requests.RequestException:
+        return ""
+
+def get_data(base_url):
+    data_list = []
+
+    for i in range(0, 10):
+        url = base_url + str(i + 1)
+        html = ask_url(url)
+        if html == "":
+            continue
+        soup = BeautifulSoup(html, 'html.parser')
+        for item in soup.find_all('div', class_="result_item"):
+            data = {}
+
+            # Only search results tagged "疾病" (disease) are collected.
+            if item.div.p.span is None or item.div.p.span.string != "疾病":
+                continue
+            data['diseaseName'] = item.div.p.a.string
+
+            symptoms = []
+            p = item.find('p', class_='result_item_content_label')
+            for symptom in p.find_all('a'):
+                symptoms.append(symptom.string)
+
+            # Follow the link to the disease's detail page.
+            sub_url = item.div.p.a.attrs["href"]
+            sub_html = ask_url(sub_url)
+            if sub_html == "":
+                continue
+            sub_soup = BeautifulSoup(sub_html, 'html.parser')
+
+            information_ul = sub_soup.find('ul', class_="information_ul")
+            if information_ul is not None:
+                for detail in information_ul.find_all('li'):
+                    if detail.i.string == '别名:':  # alias
+                        data['diseaseAlias'] = detail.span.string
+                    elif detail.i.string == '发病部位:':  # site of onset
+                        data['siteOfOnset'] = [a.string for a in detail.span.find_all('a')]
+                    elif detail.i.string == '传染性:':  # infectivity
+                        data['infectivity'] = detail.span.string
+                    elif detail.i.string == '多发人群:':  # susceptible population
+                        data['multiplePopulation'] = detail.span.string
+                    elif detail.i.string == '并发症:':  # complications
+                        data['complication'] = [a.string for a in detail.span.find_all('a')]
+                    elif detail.i.string == '挂号科室:':  # registration department
+                        data['registrationDepartment'] = [a.string for a in detail.span.find_all('a')]
+                    elif detail.i.string == '临床检查:':  # clinical examinations
+                        data['clinicalExamination'] = [a.string for a in detail.span.find_all('a')]
+                    elif detail.i.string == '典型症状:':  # typical symptoms
+                        for symptom in detail.span.find_all('a'):
+                            symptoms.append(symptom.string)
+            data['typicalSymptom'] = symptoms
+
+            information_ul1 = sub_soup.find('ul', class_="information_ul information_ul_bottom")
+            if information_ul1 is not None:
+                for detail in information_ul1.find_all('li'):
+                    if detail.i.string == '常用药品:':  # common drugs
+                        data['commonDrugs'] = [a.string for a in detail.span.find_all('a')]
+
+            data_list.append(data)
+
+    return data_list
+
+def save_data(data_list, save_path):
+    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
+    sheet = book.add_sheet("疾病", cell_overwrite_ok=True)
+    col = ("diseaseName", "diseaseAlias", "siteOfOnset", "infectivity",
+           "multiplePopulation", "complication", "registrationDepartment",
+           "clinicalExamination", "typicalSymptom", "commonDrugs")
+    for j in range(len(col)):
+        sheet.write(0, j, col[j])
+    for i, data in enumerate(data_list):
+        # print("\rProgress: {:.2f}%".format((i + 1) * 100 / len(data_list)), end="")
+        for j in range(len(col)):
+            if col[j] in data:
+                value = data[col[j]]
+                # xlwt cannot write a Python list into a cell; join multi-valued fields.
+                if isinstance(value, list):
+                    value = "、".join(s for s in value if s)
+                sheet.write(i + 1, j, value)
+    book.save(save_path)
+
+if __name__ == "__main__":
+    base_url = "https://jbk.39.net/bw/jizhenke_p"
+    save_path = r"D:\数据采集\PycharmProjects\pythonProject\数据采集\文件\疾病.xls"
+
+    data_list = get_data(base_url)
+    save_data(data_list, save_path)