# TDXZ
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import re
import urllib.request, urllib.error
import csv
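
# Scrapes a WeChat (mp.weixin.qq.com) article that lists confirmed cases and their
# day-by-day itineraries, splits each itinerary into per-time-slot records, and
# writes them to test.csv with the columns 病例 / 开始日期 / 结束日期 / 时间 / 事件.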

# Regular expressions for pulling fields out of the article HTML.
# Case name: bold text inside the heavily inline-styled <span><strong> blocks.
findcase = re.compile(r'<span style="outline: 0px;max-width: 100%;font-size: 16px;box-sizing: border-box !important;overflow-wrap: break-word !important;"><strong style="outline: 0px;max-width: 100%;box-sizing: border-box !important;overflow-wrap: break-word !important;">(.*?)</strong>')
# Paragraph body: plain text inside the styled <span> that closes a paragraph.
findbd = re.compile(r'<span style="outline: 0px;max-width: 100%;font-size: 16px;box-sizing: border-box !important;overflow-wrap: break-word !important;">(.*?)</span></p>', re.S)
# Date such as "3月15日".
finddate = re.compile(r'(\d{1,2}月\d{1,2}日|\d{1,2}月\d{1,2}月)')
# Time of day: a range such as "9:00-10:30" or a single time such as "9:00".
findtime = re.compile(r'(\d{1,2}:\d{2}-\d{1,2}:\d{2}|\d{1,2}:\d{2})')
# Event description: starts with a time (or time range) and runs to ";" or "。".
findevent = re.compile(r'(\d{1,2}:\d{2}-\d{1,2}:\d{2}.*?;|\d{1,2}:\d{2}.*?;|\d{1,2}:\d{2}-\d{1,2}:\d{2}.*?。|\d{1,2}:\d{2}.*?。)')
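
# Illustration (hypothetical paragraph text, not taken from the article) of how the
# patterns divide a sentence such as "3月15日 9:00-10:30乘坐公交车前往超市;":
#   finddate  -> "3月15日"
#   findtime  -> "9:00-10:30"
#   findevent -> "9:00-10:30乘坐公交车前往超市;"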


def main():
    # The WeChat article to scrape: fetch -> parse into per-paragraph records -> CSV
    baseurl = "https://mp.weixin.qq.com/s/K0u_qPFQtWuH4hk5K2xWfQ"
    datalist = getData(baseurl)
    saveData(datalist)


def getData(baseurl):
    # Parse the article into one record per paragraph:
    # [dates, case name (single-element list), times, cleaned event texts]
    datalist = []
    data = []
    case = []
    case2 = []
    j = 0

    html = askURL(baseurl)
    soup = BeautifulSoup(html, "html.parser")
    # The body of a WeChat post lives in <div id="js_content">
    rich = soup.find_all('div', id="js_content")
    rich = str(rich)

    # Collect the bold case names; case[0] is an empty placeholder so that
    # case[j] lines up with the j-th case heading encountered below
    case1 = re.findall(findcase, rich)
    case2.append('')
    case.append(case2)
    case2 = []
    for i in range(0, len(case1)):
        case2.append(case1[i])
        case.append(case2)
        case2 = []

    bd = re.findall(findbd, rich)
    for i in range(0, len(bd)):
        # Dates mentioned in this paragraph
        date = re.findall(finddate, bd[i])
        data.append(date)
        # A paragraph without any time is treated as the next case heading
        time = re.findall(findtime, bd[i])
        if len(time) == 0:
            j = j + 1
        if j <= 3:
            data.append(case[j])
        data.append(time)
        # Events: strip the leading time, separators and trailing punctuation
        event = re.findall(findevent, bd[i])
        for n in range(0, len(event)):
            event[n] = re.sub('-', "", event[n])
            event[n] = re.sub(r'\d{1,2}', "", event[n])
            event[n] = re.sub(':', "", event[n])
            event[n] = re.sub(';', "", event[n])
            event[n] = re.sub('。', "", event[n])
            event[n] = event[n].replace(" ", "")
        data.append(event)
        datalist.append(data)
        data = []
    return datalist


def askURL(url):
    # Fetch the page while presenting a desktop Chrome User-Agent,
    # otherwise the server may refuse the request
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def saveData(datalist):
    # Flatten the per-paragraph records into CSV rows of
    # (case, start date, end date, time, event); the last three records are skipped
    tcmp = []
    for i in range(0, len(datalist) - 3):
        ab = datalist[i]
        j = 0
        if ab[3]:                # only paragraphs that contain events become rows
            a = ab[0][0]         # date, used as both start and end date
            b = ab[1][0]         # case name
            c = ab[2]            # times (or time ranges)
            d = ab[3]            # cleaned event descriptions
            y = len(d)
            for item in c:
                temp = []
                temp.append(b)
                temp.append(a)
                temp.append(a)
                temp.append(item)
                temp.append(d[j])
                if j < y - 1:    # reuse the last event if there are more times than events
                    j = j + 1
                tcmp.append(temp)

    col = ("病例", "开始日期", "结束日期", "时间", "事件")
    with open('test.csv', 'w', newline='') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(col)
        csv_writer.writerows(tcmp)


if __name__ == "__main__":
    main()
    print("爬取完毕!")  # "Scraping finished!"