|
|
@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
import xlwt
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
def ask_url(url):
    """Download *url* and return its body decoded as UTF-8.

    The request is a GET with a desktop-browser ``User-Agent`` header and a
    30-second timeout.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text, or an empty string when the request fails
        (connection problem, timeout, malformed URL, or an HTTP error status
        reported by ``raise_for_status``).
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0",
    }
    try:
        r = requests.get(url, headers=head, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "page unavailable".  Only
        # requests' own errors are caught so that Ctrl-C, SystemExit and
        # genuine programming errors are no longer silently swallowed.
        return ""
    # Force UTF-8 instead of relying on the encoding requests would pick.
    r.encoding = 'utf-8'
    return r.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _link_texts(tag):
    """Return the ``.string`` of every <a> inside *tag* ([] when *tag* is None)."""
    if tag is None:
        return []
    return [link.string for link in tag.find_all('a')]


def _detail_rows(soup, ul_class):
    """Yield ``(label, span)`` for each <li> of the first <ul> matching *ul_class*.

    Rows lacking either the <i> label or the <span> value are skipped.
    """
    ul = soup.find('ul', class_=ul_class)
    if ul is None:
        return
    for li in ul.find_all('li'):
        if li.i is not None and li.span is not None:
            yield li.i.string, li.span


def get_data(base_url, page_count=10):
    """Scrape disease records from the paginated listing at *base_url*.

    Pages ``base_url + "1"`` .. ``base_url + str(page_count)`` are fetched with
    ``ask_url``.  For every ``div.result_item`` tagged "疾病" the linked detail
    page is fetched as well and its information lists are parsed.

    Args:
        base_url: Listing URL prefix; the 1-based page number is appended.
        page_count: Number of listing pages to visit (default 10, the value
            that used to be hard-coded).

    Returns:
        A list of dicts, one per disease whose detail page could be fetched.
        Possible keys: ``diseaseName``, ``diseaseAlias``, ``siteOfOnset``,
        ``infectivity``, ``multiplePopulation``, ``complication``,
        ``registrationDepartment``, ``clinicalExamination``, ``commonDrugs``
        and ``typicalSymptoms``.  A key is absent when the page does not show
        the corresponding row (``diseaseName`` and ``typicalSymptoms`` are
        always present).
    """
    # Detail-page label -> result key, for rows holding one text value.
    scalar_fields = {
        '别名:': 'diseaseAlias',
        '传染性:': 'infectivity',
        '多发人群:': 'multiplePopulation',
    }
    # Detail-page label -> result key, for rows holding a list of links.
    list_fields = {
        '发病部位:': 'siteOfOnset',
        '并发症:': 'complication',
        '挂号科室:': 'registrationDepartment',
        '临床检查:': 'clinicalExamination',
    }

    data_list = []
    for page in range(1, page_count + 1):
        html = ask_url(base_url + str(page))
        if not html:
            continue
        soup = BeautifulSoup(html, 'html.parser')

        for item in soup.find_all('div', class_="result_item"):
            # Malformed entries are skipped instead of aborting the whole
            # crawl with an AttributeError on a missing tag.
            title = item.div.p if item.div is not None else None
            if title is None or title.span is None or title.a is None:
                continue
            if title.span.string != "疾病":
                continue
            sub_url = title.a.attrs.get("href")
            if not sub_url:
                continue

            data = {'diseaseName': title.a.string}
            # Symptoms come from the listing entry first, then from the
            # "典型症状:" row of the detail page.
            symptoms = _link_texts(
                item.find('p', class_='result_item_content_label'))

            sub_html = ask_url(sub_url)
            if not sub_html:
                # Without the detail page the record is dropped entirely.
                continue
            sub_soup = BeautifulSoup(sub_html, 'html.parser')

            for label, span in _detail_rows(sub_soup, "information_ul"):
                if label == '典型症状:':
                    symptoms.extend(_link_texts(span))
                elif label in scalar_fields:
                    data[scalar_fields[label]] = span.string
                elif label in list_fields:
                    data[list_fields[label]] = _link_texts(span)

            # Previously the symptoms were written to data['commonDrugs'],
            # which polluted the drug column whenever no drug row existed.
            data['typicalSymptoms'] = symptoms

            for label, span in _detail_rows(
                    sub_soup, "information_ul information_ul_bottom"):
                if label == '常用药品:':
                    data['commonDrugs'] = _link_texts(span)

            data_list.append(data)

    return data_list
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def save_data(data_list, save_path):
    """Write the scraped disease records to an ``.xls`` workbook.

    A single sheet named "疾病" is created.  Row 0 holds the column names;
    each following row holds one record from *data_list*.

    Args:
        data_list: Sequence of dicts keyed by the column names below.  Keys
            missing from a record leave the cell empty; keys not listed as a
            column are ignored.
        save_path: Destination path handed to ``Workbook.save``.

    Returns:
        An empty string (kept so existing callers see the same result).
    """
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet("疾病", cell_overwrite_ok=True)
    col = ("diseaseName", "diseaseAlias", "siteOfOnset", "infectivity",
           "multiplePopulation", "complication", "registrationDepartment",
           "clinicalExamination", "commonDrugs")

    # Header row; iterating over `col` keeps it in sync with the tuple rather
    # than relying on a separately hard-coded column count.
    for j, name in enumerate(col):
        sheet.write(0, j, name)

    for i, data in enumerate(data_list, start=1):
        for j, name in enumerate(col):
            if name not in data:
                continue
            value = data[name]
            if isinstance(value, (list, tuple)):
                # A cell holds a single value, so multi-valued fields are
                # flattened into one comma-separated string; None entries
                # (links without plain text) are dropped.
                value = ",".join(str(v) for v in value if v is not None)
            sheet.write(i, j, value)

    book.save(save_path)
    return ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: crawl the listing and dump the records to Excel.
    base_url = "https://jbk.39.net/bw/jizhenke_p"
    # Raw string: the Windows path contains backslash sequences such as "\P"
    # and "\p" that are invalid escapes in a normal literal and trigger a
    # SyntaxWarning on current Python.  The resulting value is unchanged.
    save_path = r"D:\数据采集\PycharmProjects\pythonProject\数据采集\文件\疾病.xls"

    data_list = get_data(base_url)
    save_data(data_list, save_path)
|