You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

105 lines
4.5 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

from bs4 import BeautifulSoup
import xlwt
import requests
def ask_url(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    A desktop-browser User-Agent is sent because the target site serves
    different (or no) content to default client UAs.

    Returns an empty string on any network/HTTP failure so callers can
    simply skip the page instead of crashing mid-crawl.
    """
    head = {
        "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"
    }
    try:
        r = requests.get(url, headers=head, timeout=30)
        r.raise_for_status()
        # Force UTF-8: the site's pages are UTF-8 but may not declare it
        # in the response headers, so requests' guess can be wrong.
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Narrowed from a bare ``except:`` — only network/HTTP errors are
        # swallowed; programming errors and KeyboardInterrupt still surface.
        return ""
def get_data(base_url):
    """Crawl 10 listing pages under *base_url* and scrape disease details.

    Listing page URLs are ``base_url + page_number`` (pages 1..10). For each
    result marked "疾病" (disease) the linked detail page is fetched and
    parsed into a dict with keys matching the spreadsheet columns used by
    ``save_data``. Pages that fail to download are skipped silently.

    Returns a list of per-disease dicts.
    """
    data_list = []
    for page in range(1, 11):
        html = ask_url(base_url + str(page))
        if not html:
            continue  # listing page failed to download; skip it
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', class_="result_item"):
            # Only entries tagged "疾病" are diseases; other result types
            # (symptoms, drugs, ...) are ignored.
            if item.div.p.span.string != "疾病":
                continue
            data = {'diseaseName': item.div.p.a.string}
            # Symptoms from the listing snippet; the detail page may add more.
            symptoms = [a.string
                        for a in item.find('p', class_='result_item_content_label').find_all('a')]
            sub_html = ask_url(item.div.p.a.attrs["href"])
            if not sub_html:
                continue  # detail page failed; drop this entry entirely
            sub_soup = BeautifulSoup(sub_html, 'html.parser')
            information_ul = sub_soup.find('ul', class_="information_ul")
            # Guard: find() returns None when the block is absent — the
            # original crashed with AttributeError on such pages.
            if information_ul is not None:
                for detail in information_ul.find_all('li'):
                    label = detail.i.string
                    if label == '别名:':
                        data['diseaseAlias'] = detail.span.string
                    elif label == '发病部位:':
                        data['siteOfOnset'] = [a.string for a in detail.span.find_all('a')]
                    elif label == '传染性:':
                        data['infectivity'] = detail.span.string
                    elif label == '多发人群:':
                        data['multiplePopulation'] = detail.span.string
                    elif label == '并发症:':
                        data['complication'] = [a.string for a in detail.span.find_all('a')]
                    elif label == '挂号科室:':
                        data['registrationDepartment'] = [a.string for a in detail.span.find_all('a')]
                    elif label == '临床检查:':
                        data['clinicalExamination'] = [a.string for a in detail.span.find_all('a')]
                    elif label == '典型症状:':
                        symptoms.extend(a.string for a in detail.span.find_all('a'))
            # NOTE(review): symptoms are stored under 'commonDrugs' and may be
            # overwritten by the real drug list just below — this looks like a
            # latent bug in the original schema (symptoms can be silently
            # lost), but it is preserved here for output compatibility.
            data['commonDrugs'] = symptoms
            information_ul1 = sub_soup.find('ul', class_="information_ul information_ul_bottom")
            if information_ul1 is not None:  # same missing-block guard as above
                for detail in information_ul1.find_all('li'):
                    if detail.i.string == '常用药品:':
                        data['commonDrugs'] = [a.string for a in detail.span.find_all('a')]
            data_list.append(data)
    return data_list
def save_data(data_list, save_path):
    """Write *data_list* (list of per-disease dicts) to an .xls workbook.

    One header row of column names, then one row per dict; missing keys
    leave the cell empty. Multi-valued fields (lists of strings) are joined
    into a single comma-separated cell, because xlwt cannot serialize a
    Python list — the original code passed lists straight to
    ``sheet.write`` and crashed on the first list-valued field.

    Returns "" (kept for interface compatibility with existing callers).
    """
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet("疾病", cell_overwrite_ok=True)
    columns = ("diseaseName", "diseaseAlias", "siteOfOnset", "infectivity",
               "multiplePopulation", "complication", "registrationDepartment",
               "clinicalExamination", "commonDrugs")
    for col_index, title in enumerate(columns):
        sheet.write(0, col_index, title)
    for row_index, data in enumerate(data_list, start=1):
        for col_index, key in enumerate(columns):
            if key not in data:
                continue
            value = data[key]
            if isinstance(value, list):
                # Flatten list fields; drop None entries (BeautifulSoup's
                # .string can be None for nested tags).
                value = ",".join(s for s in value if s)
            sheet.write(row_index, col_index, value)
    book.save(save_path)
    return ""
if __name__ == "__main__":
    # Listing URL prefix; get_data appends the 1-based page number.
    base_url = "https://jbk.39.net/bw/jizhenke_p"
    # Raw string: the original non-raw literal depended on sequences like
    # '\数' not being recognized escapes, which emits SyntaxWarning on
    # modern CPython. The runtime value is unchanged.
    save_path = r"D:\数据采集\PycharmProjects\pythonProject\数据采集\文件\疾病.xls"
    data_list = get_data(base_url)
    save_data(data_list, save_path)