From be00828c48ded7e80bef775419649e6aa1913442 Mon Sep 17 00:00:00 2001 From: ppco9heus <3100279293@qq.com> Date: Sat, 23 Apr 2022 11:54:25 +0800 Subject: [PATCH] ADD file via upload --- getdata.py | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 getdata.py diff --git a/getdata.py b/getdata.py new file mode 100644 index 0000000..bf45d2d --- /dev/null +++ b/getdata.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- +from bs4 import BeautifulSoup +import re +import urllib.request, urllib.error +import csv + +findbd = re.compile(r'(.*?)

', re.S) +finddate = re.compile(r'(\d{1,2}月\d{1,2}日|\d{1,2}月\d{1,2}月)') +findtime = re.compile(r'(\d{1,2}:\d{2}-\d{1,2}:\d{2}|\d{1,2}:\d{2}-\d{1,2}:\d{2}|\d{1,2}:\d{2}-\d{1,2}:\d{2}|\d{1,2}:\d{2}|\d{1,2}:\d{2})') +findevent = re.compile(r'(\d{1,2}:\d{2}-\d{1,2}:\d{2}.*?;|\d{1,2}:\d{2}-\d{1,2}:\d{2}.*?;|\d{1,2}:\d{2}-\d{1,2}:\d{2}.*?;|\d{1,2}:\d{2}.*?;|\d{1,2}:\d{2}.*?;|\d{1,2}:\d{2}-\d{1,2}:\d{2}.*?。|\d{1,2}:\d{2}-\d{1,2}:\d{2}.*?。|\d{1,2}:\d{2}.*?。|\d{1,2}:\d{2}.*?。)') + +def main(): + baseurl = "https://mp.weixin.qq.com/s/K0u_qPFQtWuH4hk5K2xWfQ" + datalist = getData(baseurl) + saveData(datalist) + +def getData(baseurl): + datalist=[] + data=[] + case=[[''],['病例1:'],['病例2:'],['病例3:']] + j=0 + url = baseurl + html = askURL(url) + soup = BeautifulSoup(html, "html.parser") + rich=soup.find_all('div',id="js_content") + rich=str(rich) + bd = re.findall(findbd,rich) + x=len(bd) + for i in range(0,x): + date=re.findall(finddate,bd[i]) + data.append(date) + time=re.findall(findtime,bd[i]) + if len(time)==0: + j=j+1 + if j<=3: + data.append(case[j]) + data.append(time) + event=re.findall(findevent,bd[i]) + x=len(event) + for i in range(0,x): + event[i]=re.sub('-',"",event[i]) + event[i]=re.sub('\d{1,2}',"",event[i]) + event[i]=re.sub(':',"",event[i]) + event[i]=re.sub(':',"",event[i]) + event[i]=re.sub(';',"",event[i]) + event[i]=re.sub('。',"",event[i]) + event[i].replace(" ", "") + data.append(event) + datalist.append(data) + data=[] + return datalist + +def askURL(url): + head = { + "User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 80.0.3987.122 Safari / 537.36" + } + request = urllib.request.Request(url, headers=head) + html = "" + try: + response = urllib.request.urlopen(request) + html = response.read().decode("utf-8") + except urllib.error.URLError as e: + if hasattr(e, "code"): + print(e.code) + if hasattr(e, "reason"): + print(e.reason) + return html + +def saveData(datalist): + x=len(datalist) + tcmp=[] + for i in range(0,x-3): + temp=[] + ab=datalist[i] + j=0 + if ab[3]: + a=ab[0][0] + b=ab[1][0] + c=ab[2] + d=ab[3] + y=len(d) + for item in c: + temp=[] + temp.append(b) + temp.append(a) + temp.append(a) + temp.append(item) + temp.append(d[j]) + if j