# getdataquery/getdata.py

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re
import urllib.request, urllib.error
import csv

# Captures the text inside a <strong> element nested directly in a <span>
# carrying these exact inline styles. getData() collects every capture into
# its list of case headings.
findcase = re.compile(r'<span style="outline: 0px;max-width: 100%;font-size: 16px;box-sizing: border-box !important;overflow-wrap: break-word !important;"><strong style="outline: 0px;max-width: 100%;box-sizing: border-box !important;overflow-wrap: break-word !important;">(.*?)</strong>')
# Captures the content of a <span> with the same inline style that is
# immediately followed by </p>. re.S lets the capture run across newlines.
findbd = re.compile(r'<span style="outline: 0px;max-width: 100%;font-size: 16px;box-sizing: border-box !important;overflow-wrap: break-word !important;">(.*?)</span></p>', re.S)
# Matches a date written as "<1-2 digits>月<1-2 digits>日".
# NOTE(review): the second alternative accepts "月...月"; presumably this
# tolerates a typo in the scraped text -- confirm it is intentional.
finddate = re.compile(r'(\d{1,2}月\d{1,2}日|\d{1,2}月\d{1,2}月)')
# Matches a clock time "H:MM" or a range "H:MM-H:MM", written with either the
# ASCII colon or the full-width colon. Range alternatives come first so that
# a range is returned as one match instead of two separate times.
findtime = re.compile(r'(\d{1,2}:\d{2}-\d{1,2}：\d{2}|\d{1,2}：\d{2}-\d{1,2}：\d{2}|\d{1,2}:\d{2}-\d{1,2}:\d{2}|\d{1,2}：\d{2}|\d{1,2}:\d{2})')
# Matches a time (or time range) together with the shortest following text up
# to a full-width semicolon or a full-width period. Without re.S, "." does
# not cross line breaks.
# NOTE(review): the alternatives ending in the semicolon are tried before the
# ones ending in the period, so an entry that ends with a period but has a
# semicolon later on the same line is matched up to that semicolon.
findevent = re.compile(r'(\d{1,2}:\d{2}-\d{1,2}：\d{2}.*?；|\d{1,2}：\d{2}-\d{1,2}：\d{2}.*?；|\d{1,2}:\d{2}-\d{1,2}:\d{2}.*?；|\d{1,2}：\d{2}.*?；|\d{1,2}:\d{2}.*?；|\d{1,2}：\d{2}-\d{1,2}：\d{2}.*?。|\d{1,2}:\d{2}-\d{1,2}:\d{2}.*?。|\d{1,2}：\d{2}.*?。|\d{1,2}:\d{2}.*?。)')

def main():
    """Scrape the hard-coded article and export the parsed rows to CSV."""
    article_url = "https://mp.weixin.qq.com/s/K0u_qPFQtWuH4hk5K2xWfQ"
    saveData(getData(article_url))

# Characters removed from a matched event so that only its description is
# left: range dashes, digits, ASCII and full-width colons, the terminating
# full-width semicolon/period, and ASCII spaces.
_EVENT_NOISE = re.compile(r'[-\d：:；。 ]')

# Highest case index a row may reference.
# NOTE(review): presumably the target article lists exactly three cases --
# confirm before pointing this scraper at another page.
_MAX_CASE_INDEX = 3


def getData(baseurl):
    """Download the article at *baseurl* and parse it into rows.

    The ``js_content`` div is rendered back to a string and scanned with the
    module-level regexes. Case headings are collected first; index 0 is an
    empty placeholder heading used until the first paragraph without a clock
    time is seen. Every paragraph matched by ``findbd`` then yields one row.
    A paragraph containing no clock time advances the case index, so that
    paragraph and the ones after it reference the next heading.

    Returns:
        A list of rows. A row is ``[dates, case, times, events]`` where each
        item is a list (``case`` holds a single heading string). Once the
        case index exceeds the supported range the ``case`` item is omitted
        and the row is ``[dates, times, events]``.
    """
    html = askURL(baseurl)
    soup = BeautifulSoup(html, "html.parser")
    rich = str(soup.find_all('div', id="js_content"))

    # Slot 0 is the placeholder heading; real headings start at index 1.
    cases = [['']] + [[heading] for heading in re.findall(findcase, rich)]

    datalist = []
    case_index = 0
    for paragraph in re.findall(findbd, rich):
        dates = re.findall(finddate, paragraph)
        times = re.findall(findtime, paragraph)
        if not times:
            case_index += 1

        row = [dates]
        # The second bound prevents an IndexError when the page contains
        # fewer headings than the hard-coded limit assumes.
        if case_index <= _MAX_CASE_INDEX and case_index < len(cases):
            row.append(cases[case_index])
        row.append(times)
        # Strings are immutable, so the cleaned text has to be collected;
        # calling str.replace() and dropping its result changes nothing.
        row.append([
            _EVENT_NOISE.sub("", event)
            for event in re.findall(findevent, paragraph)
        ])
        datalist.append(row)
    return datalist

def askURL(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request is sent with a browser-like ``User-Agent`` header. If the
    request fails with ``urllib.error.URLError`` (which includes
    ``HTTPError``), the error's ``code`` and/or ``reason`` are printed and an
    empty string is returned instead of raising.

    Args:
        url: Address to request.

    Returns:
        The decoded body, or ``""`` when the request failed.
    """
    head = {
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 80.0.3987.122  Safari / 537.36"
    }
    request = urllib.request.Request(url, headers=head)
    try:
        # The context manager closes the connection even when reading or
        # decoding fails, so the response is never leaked.
        with urllib.request.urlopen(request) as response:
            return response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return ""

def saveData(datalist, filename='test.csv', *, skip_tail=3):
    """Flatten parsed rows into one CSV record per clock time and write them.

    Each usable row of *datalist* is ``[dates, case, times, events]``. For
    every entry in ``times`` one record ``[case, date, date, time, event]``
    is produced, where ``date`` is the first date of the row (used for both
    the start and end column). Times are paired with events by position; if
    there are more times than events, the last event is reused.

    Rows are skipped when they have no events or fewer than four fields.
    A row without any date (or case heading) gets an empty cell for it
    instead of raising ``IndexError``.

    Args:
        datalist: Rows as returned by ``getData``.
        filename: Path of the CSV file to (over)write.
        skip_tail: Number of trailing rows of *datalist* to ignore.
            NOTE(review): the default of 3 presumably drops footer
            paragraphs of the target article -- confirm.

    The file is written as UTF-8 so the Chinese header and content do not
    depend on the platform's locale encoding.
    """
    usable = datalist[:max(len(datalist) - skip_tail, 0)]

    records = []
    for row in usable:
        if len(row) < 4 or not row[3]:
            continue
        dates, case, times, events = row[0], row[1], row[2], row[3]
        date = dates[0] if dates else ""
        label = case[0] if case else ""
        last_event = len(events) - 1
        for position, clock in enumerate(times):
            # Clamp to the final event when times outnumber events.
            event = events[min(position, last_event)]
            records.append([label, date, date, clock, event])

    col = ("病例","开始日期","结束日期","时间","事件")
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(col)
        csv_writer.writerows(records)
if __name__ == "__main__":
    # Script entry point: run the scrape-and-export pipeline, then print a
    # completion message.
    main()
    print("爬取完毕！")