master
pk2oanwy3 3 years ago
parent ad5be06900
commit 780eb033c5

@ -1,112 +1,18 @@
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re
import urllib.request, urllib.error
import csv
# Captures the inner text of a <strong> element that directly follows a <span>
# carrying this exact inline style string (non-greedy, single line).
findcase = re.compile(r'<span style="outline: 0px;max-width: 100%;font-size: 16px;box-sizing: border-box !important;overflow-wrap: break-word !important;"><strong style="outline: 0px;max-width: 100%;box-sizing: border-box !important;overflow-wrap: break-word !important;">(.*?)</strong>')
# Captures the inner text of a <span> with this exact inline style that is closed
# by "</span></p>"; re.S lets "." span newlines inside the paragraph.
findbd = re.compile(r'<span style="outline: 0px;max-width: 100%;font-size: 16px;box-sizing: border-box !important;overflow-wrap: break-word !important;">(.*?)</span></p>', re.S)
# Matches a date written as "<1-2 digits>月<1-2 digits>日".
# NOTE(review): the second alternative ends in "月" instead of "日"; presumably it
# tolerates a typo in the scraped page -- confirm it is intentional.
finddate = re.compile(r'(\d{1,2}月\d{1,2}日|\d{1,2}月\d{1,2}月)')
# Matches a clock time or a "start-end" time range.
# NOTE(review): several alternatives have no separator between the hour and minute
# digits (e.g. `\d{1,2}\d{2}`); possibly a non-ASCII colon was lost from the
# pattern -- verify against the original file.
findtime = re.compile(r'(\d{1,2}:\d{2}-\d{1,2}\d{2}|\d{1,2}\d{2}-\d{1,2}\d{2}|\d{1,2}:\d{2}-\d{1,2}:\d{2}|\d{1,2}\d{2}|\d{1,2}:\d{2})')
# Matches a time (or time range) followed by lazily matched trailing text.
# NOTE(review): as written, the alternatives that end in a bare lazy `.*?` can
# succeed with zero trailing characters, and every "。"-terminated alternative
# starts with the same prefix as an earlier bare one, so the "。" forms look
# unreachable and only the time prefix is captured. A terminator character may
# have been lost from the earlier alternatives -- confirm.
findevent = re.compile(r'(\d{1,2}:\d{2}-\d{1,2}\d{2}.*?|\d{1,2}\d{2}-\d{1,2}\d{2}.*?|\d{1,2}:\d{2}-\d{1,2}:\d{2}.*?|\d{1,2}\d{2}.*?|\d{1,2}:\d{2}.*?|\d{1,2}\d{2}-\d{1,2}\d{2}.*?。|\d{1,2}:\d{2}-\d{1,2}:\d{2}.*?。|\d{1,2}\d{2}.*?。|\d{1,2}:\d{2}.*?。)')
def main():
    """Scrape the hard-coded article URL and hand the parsed entries to saveData."""
    article_url = "https://mp.weixin.qq.com/s/K0u_qPFQtWuH4hk5K2xWfQ"
    saveData(getData(article_url))
# Characters stripped from every event match: hyphens, digits, ASCII colons,
# the ideographic full stop and spaces.
# NOTE(review): the original cleanup also ran two substitutions with an EMPTY
# pattern, which are no-ops as written. If they were meant to strip a character
# that got lost from the source, add that character to this class.
_EVENT_NOISE = re.compile(r'[-\d:。 ]')


def _clean_event(text):
    """Return *text* with time digits, separators, '。' and spaces removed."""
    return _EVENT_NOISE.sub("", text)


def getData(baseurl):
    """Download the page at *baseurl* and parse it into per-paragraph entries.

    The text of every ``<div id="js_content">`` is scanned with the
    module-level patterns ``findcase``, ``findbd``, ``finddate``, ``findtime``
    and ``findevent``.

    Args:
        baseurl: URL of the page to fetch (passed straight to ``askURL``).

    Returns:
        A list with one entry per ``findbd`` match. Each entry is
        ``[dates, case, times, events]`` where every element is a list of
        strings. ``case`` is the one-element list holding the current heading
        (``['']`` before the first heading); it is omitted once more
        time-less paragraphs have been seen than there are headings.
    """
    html = askURL(baseurl)
    soup = BeautifulSoup(html, "html.parser")
    rich = str(soup.find_all('div', id="js_content"))

    # Index 0 is a blank placeholder used for paragraphs that precede the
    # first heading; index n holds the n-th heading found on the page.
    case = [['']] + [[heading] for heading in re.findall(findcase, rich)]

    datalist = []
    j = 0  # index into `case` of the heading the current paragraph belongs to
    for paragraph in re.findall(findbd, rich):
        data = [re.findall(finddate, paragraph)]
        times = re.findall(findtime, paragraph)
        # A paragraph without any time is treated as the start of the next case.
        if not times:
            j += 1
        # Bounded by the number of headings actually found (the original
        # hard-coded 3, which raised IndexError on pages with fewer headings).
        if j < len(case):
            data.append(case[j])
        data.append(times)
        # The original called str.replace(" ", "") but discarded the result,
        # so spaces were never removed; _clean_event strips them as intended.
        data.append([_clean_event(match)
                     for match in re.findall(findevent, paragraph)])
        datalist.append(data)
    return datalist
def askURL(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address to request; sent with a browser-like User-Agent header.

    Returns:
        The decoded body, or an empty string when the request fails with
        ``urllib.error.URLError`` (the error's ``code`` and/or ``reason`` are
        printed in that case).

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    head = {
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 80.0.3987.122 Safari / 537.36"
    }
    request = urllib.request.Request(url, headers=head)
    try:
        # The context manager closes the response even if read() fails; the
        # original left the connection open.
        with urllib.request.urlopen(request) as response:
            return response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries a status code; a plain URLError only has a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return ""
def saveData(datalist):
    """Flatten parsed entries into rows and write them to ``test.csv``.

    Each entry is indexed as ``[dates, case, times, events]``. Entries whose
    event list is empty are skipped, and the final three entries of
    *datalist* are never written. One row is produced per time; the n-th time
    is paired with the n-th event, and the last event is reused when there
    are more times than events. The first date fills both date columns.
    """
    rows = []
    usable = max(len(datalist) - 3, 0)  # the last three entries are ignored
    for entry in datalist[:usable]:
        events = entry[3]
        if not events:
            continue
        date = entry[0][0]
        label = entry[1][0]
        last = len(events) - 1
        rows.extend(
            [label, date, date, moment, events[min(pos, last)]]
            for pos, moment in enumerate(entry[2])
        )

    header = ("病例", "开始日期", "结束日期", "时间", "事件")
    with open('test.csv', 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(header)
        writer.writerows(rows)
if __name__ == "__main__":
    # Step 1: scrape the article and (re)write test.csv.
    main()
    print("爬取完毕!")

    # Step 2: interactive lookup over the CSV that was just written.
    case = input("病例名称:")
    Start_date = input("开始日期:")
    End_date = input("结束日期:")
    keyword = input("查询关键字:")

    flag = False
    # newline='' is what the csv module expects for files it reads; the
    # `with` block closes the handle, which the original never did.
    with open('test.csv', newline='') as csv_file:
        csv_reader = csv.reader(csv_file)
        next(csv_reader, None)  # skip the header row; tolerate an empty file
        for row in csv_reader:
            # Rows with fewer than five columns cannot be matched.
            if len(row) < 5:
                continue
            # Case name and keyword are substring matches; dates must be equal.
            if (case in row[0] and Start_date == row[1]
                    and End_date == row[2] and keyword in row[4]):
                print(' '.join(row))
                flag = True
    if not flag:
        print("未查找到!")
Loading…
Cancel
Save