# -*- coding: utf-8 -*-
"""Fetch an mp.weixin.qq.com article and extract date / time / event rows."""
import csv
import re
import urllib.error
import urllib.request

from bs4 import BeautifulSoup

# NOTE(review): `findcase` and `findbd` contain no literal delimiters around
# their capture groups. As written, `findcase` matches the empty string at
# every position and `findbd` captures everything up to each newline.
# Presumably HTML tags were lost from these patterns at some point -- confirm
# against the intended markup before relying on them.
findcase = re.compile(r'(.*?)')
findbd = re.compile('(.*?)\n', re.S)

# A "M月D日" style date.
finddate = re.compile(r'(\d{1,2}月\d{1,2}日|\d{1,2}月\d{1,2}月)')

# A clock time "H:MM" or a range "H:MM-H:MM".
# NOTE(review): several alternatives are textually identical; they may have
# originally differed in punctuation variants -- confirm.
findtime = re.compile(
    r'(\d{1,2}:\d{2}-\d{1,2}:\d{2}'
    r'|\d{1,2}:\d{2}-\d{1,2}:\d{2}'
    r'|\d{1,2}:\d{2}-\d{1,2}:\d{2}'
    r'|\d{1,2}:\d{2}'
    r'|\d{1,2}:\d{2})'
)

# A time (or time range) followed by the shortest text ending in ';' or '。'.
findevent = re.compile(
    r'(\d{1,2}:\d{2}-\d{1,2}:\d{2}.*?;'
    r'|\d{1,2}:\d{2}-\d{1,2}:\d{2}.*?;'
    r'|\d{1,2}:\d{2}-\d{1,2}:\d{2}.*?;'
    r'|\d{1,2}:\d{2}.*?;'
    r'|\d{1,2}:\d{2}.*?;'
    r'|\d{1,2}:\d{2}-\d{1,2}:\d{2}.*?。'
    r'|\d{1,2}:\d{2}-\d{1,2}:\d{2}.*?。'
    r'|\d{1,2}:\d{2}.*?。'
    r'|\d{1,2}:\d{2}.*?。)'
)

# Characters stripped from every matched event: hyphens, digits, colons,
# semicolons, the ideographic full stop and ASCII spaces.
_EVENT_NOISE = re.compile(r'[-\d:;。 ]')


def main():
    """Scrape the hard-coded article URL and hand the rows to saveData."""
    baseurl = "https://mp.weixin.qq.com/s/K0u_qPFQtWuH4hk5K2xWfQ"
    datalist = getData(baseurl)
    saveData(datalist)


def _clean_event(text):
    """Return *text* with time digits, separators, terminators and spaces removed."""
    # One pass over a character class is equivalent to the chain of
    # single-character removals, and (unlike a bare ``str.replace`` call whose
    # result is dropped) the space removal actually takes effect.
    return _EVENT_NOISE.sub("", text)


def getData(baseurl):
    """Download *baseurl* and split its ``js_content`` div into rows.

    Returns a list with one row per ``findbd`` match. Each row is
    ``[dates, case, times, events]`` where ``dates``, ``times`` and ``events``
    are lists of regex matches and ``case`` is a one-element list holding the
    current ``findcase`` match. ``case`` is left out of the row once more than
    three blocks without any time have been seen.
    """
    html = askURL(baseurl)
    soup = BeautifulSoup(html, "html.parser")
    rich = str(soup.find_all('div', id="js_content"))

    # Index 0 is a blank placeholder used before the first time-less block;
    # every findcase match follows, each wrapped in its own list.
    cases = [['']] + [[title] for title in re.findall(findcase, rich)]

    datalist = []
    section = 0  # number of blocks seen so far that contain no time at all
    for block in re.findall(findbd, rich):
        dates = re.findall(finddate, block)
        times = re.findall(findtime, block)
        if not times:
            section += 1

        row = [dates]
        # NOTE(review): nesting reconstructed from a flattened source; this
        # check is taken to be independent of the "no time" check above so
        # that rows keep the four-column layout -- confirm.
        if section <= 3:
            row.append(cases[section])
        row.append(times)
        row.append([_clean_event(event) for event in re.findall(findevent, block)])
        datalist.append(row)
    return datalist


def askURL(url):
    """Return the body of *url* decoded as UTF-8, or "" on a URL error.

    On ``urllib.error.URLError`` the error's ``code`` and/or ``reason`` are
    printed (when present) and the empty string is returned.
    """
    head = {
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 80.0.3987.122 Safari / 537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the connection even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def saveData(datalist):
    """Process the row list that main() obtains from getData()."""
x=len(datalist) tcmp=[] for i in range(0,x-3): temp=[] ab=datalist[i] j=0 if ab[3]: a=ab[0][0] b=ab[1][0] c=ab[2] d=ab[3] y=len(d) for item in c: temp=[] temp.append(b) temp.append(a) temp.append(a) temp.append(item) temp.append(d[j]) if j