# TDXZ
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import re
import urllib.request, urllib.error
import csv
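
# Scrapes a WeChat (mp.weixin.qq.com) article that lists confirmed cases and their
# day-by-day itineraries, splits each itinerary into per-time-slot records, and
# writes them to test.csv with the columns 病例 / 开始日期 / 结束日期 / 时间 / 事件.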

# Regular expressions for pulling fields out of the article HTML.
# Case name: bold text inside the heavily inline-styled <span><strong> blocks.
findcase = re.compile(r'<span style="outline: 0px;max-width: 100%;font-size: 16px;box-sizing: border-box !important;overflow-wrap: break-word !important;"><strong style="outline: 0px;max-width: 100%;box-sizing: border-box !important;overflow-wrap: break-word !important;">(.*?)</strong>')
# Paragraph body: plain text inside the styled <span> that closes a paragraph.
findbd = re.compile(r'<span style="outline: 0px;max-width: 100%;font-size: 16px;box-sizing: border-box !important;overflow-wrap: break-word !important;">(.*?)</span></p>', re.S)
# Date such as "3月15日".
finddate = re.compile(r'(\d{1,2}月\d{1,2}日|\d{1,2}月\d{1,2}月)')
# Time of day: a range such as "9:00-10:30" or a single time such as "9:00".
findtime = re.compile(r'(\d{1,2}:\d{2}-\d{1,2}:\d{2}|\d{1,2}:\d{2})')
# Event description: starts with a time (or time range) and runs to ";" or "。".
findevent = re.compile(r'(\d{1,2}:\d{2}-\d{1,2}:\d{2}.*?;|\d{1,2}:\d{2}.*?;|\d{1,2}:\d{2}-\d{1,2}:\d{2}.*?。|\d{1,2}:\d{2}.*?。)')
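
# Illustration (hypothetical paragraph text, not taken from the article) of how the
# patterns divide a sentence such as "3月15日 9:00-10:30乘坐公交车前往超市;":
#   finddate  -> "3月15日"
#   findtime  -> "9:00-10:30"
#   findevent -> "9:00-10:30乘坐公交车前往超市;"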


def main():
    # The WeChat article to scrape: fetch -> parse into per-paragraph records -> CSV
    baseurl = "https://mp.weixin.qq.com/s/K0u_qPFQtWuH4hk5K2xWfQ"
    datalist = getData(baseurl)
    saveData(datalist)


def getData(baseurl):
    # Parse the article into one record per paragraph:
    # [dates, case name (single-element list), times, cleaned event texts]
    datalist = []
    data = []
    case = []
    case2 = []
    j = 0

    html = askURL(baseurl)
    soup = BeautifulSoup(html, "html.parser")
    # The body of a WeChat post lives in <div id="js_content">
    rich = soup.find_all('div', id="js_content")
    rich = str(rich)

    # Collect the bold case names; case[0] is an empty placeholder so that
    # case[j] lines up with the j-th case heading encountered below
    case1 = re.findall(findcase, rich)
    case2.append('')
    case.append(case2)
    case2 = []
    for i in range(0, len(case1)):
        case2.append(case1[i])
        case.append(case2)
        case2 = []

    bd = re.findall(findbd, rich)
    for i in range(0, len(bd)):
        # Dates mentioned in this paragraph
        date = re.findall(finddate, bd[i])
        data.append(date)
        # A paragraph without any time is treated as the next case heading
        time = re.findall(findtime, bd[i])
        if len(time) == 0:
            j = j + 1
        if j <= 3:
            data.append(case[j])
        data.append(time)
        # Events: strip the leading time, separators and trailing punctuation
        event = re.findall(findevent, bd[i])
        for n in range(0, len(event)):
            event[n] = re.sub('-', "", event[n])
            event[n] = re.sub(r'\d{1,2}', "", event[n])
            event[n] = re.sub(':', "", event[n])
            event[n] = re.sub(';', "", event[n])
            event[n] = re.sub('。', "", event[n])
            event[n] = event[n].replace(" ", "")
        data.append(event)
        datalist.append(data)
        data = []
    return datalist


def askURL(url):
    # Fetch the page while presenting a desktop Chrome User-Agent,
    # otherwise the server may refuse the request
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def saveData(datalist):
    # Flatten the per-paragraph records into CSV rows of
    # (case, start date, end date, time, event); the last three records are skipped
    tcmp = []
    for i in range(0, len(datalist) - 3):
        ab = datalist[i]
        j = 0
        if ab[3]:                # only paragraphs that contain events become rows
            a = ab[0][0]         # date, used as both start and end date
            b = ab[1][0]         # case name
            c = ab[2]            # times (or time ranges)
            d = ab[3]            # cleaned event descriptions
            y = len(d)
            for item in c:
                temp = []
                temp.append(b)
                temp.append(a)
                temp.append(a)
                temp.append(item)
                temp.append(d[j])
                if j < y - 1:    # reuse the last event if there are more times than events
                    j = j + 1
                tcmp.append(temp)

    col = ("病例", "开始日期", "结束日期", "时间", "事件")
    with open('test.csv', 'w', newline='') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(col)
        csv_writer.writerows(tcmp)


if __name__ == "__main__":
    main()
    print("爬取完毕!")  # "Scraping finished!"