ADD file via upload

2 years ago · 4ccf264b8e
parent f0ae7b88f9
commit 4ccf264b8e
1 changed files with 115 additions and 0 deletions
--- a/2000300106-BeautifulSoup.py
+++ b/2000300106-BeautifulSoup.py
@ -0,0 +1,115 @@
+from bs4 import BeautifulSoup  #网页解析，获取数据
+import re   #正则表达式，进行文字匹配
+import urllib.request  #制定URL，获取网页数据
+import urllib.error  
+import xlwt  #进行excel操作
+import sqlite3  #进行SQLite数据库操作
+
+def main():
+    """Crawl the journal listing pages and export the rows to an .xls file."""
+    # Each page URL is this prefix followed by "<page index>/" (built in getData).
+    listing_prefix = "https://www.zhangqiaokeyan.com/academic-journal-cn_"
+    output_file = "中文期刊10页.xls"
+
+    # Scrape every page first, then write all collected rows in one pass.
+    journals = getData(listing_prefix)
+    saveData(journals, output_file)
+
+# Precompiled regular expressions; getData applies them to the stringified
+# HTML of one <li class="list_item"> element.
+findLink = re.compile(r'<a href="(.*?)">')  # journal link: captures the href value (non-greedy)
+findname = re.compile(r'<a.*>(.*)</a>')  # journal name: captures the anchor text
+# The next three patterns are chained: finditem1 runs on the whole item, then
+# finditem2 and finditem3 run on finditem1's capture. With re.S the "." also
+# matches newlines, and the greedy (.*) makes finditem1 span from the first
+# <span> to the last </span> of the item.
+finditem1 = re.compile(r'<span>(.*)</span>',re.S) 
+finditem2 = re.compile(r'(.*)</span>',re.S)
+finditem3 = re.compile(r'<span>(.*)',re.S)
+findfrequency = re.compile(r'(.*)</span>',re.S) # publication frequency; same pattern and flags as finditem2
+findNUM = re.compile(r'<span>(.*)')  # ISSN and CN; no re.S, so the capture stops at the end of the line
+
+
+
+# Crawl the listing pages
+def _first_group(pattern, text):
+    """Return group 1 of the first match of *pattern* in *text*, or None."""
+    match = pattern.search(text)
+    return match.group(1) if match else None
+
+
+def _parse_item(item_html):
+    """Extract [link, name, frequency, ISSN, CN] from one list item's HTML.
+
+    Returns None as soon as any of the patterns fails to match, so the caller
+    can skip the item instead of crashing on an IndexError.
+    """
+    link = _first_group(findLink, item_html)
+    name = _first_group(findname, item_html)
+    # item1 is the text between the first <span> and the last </span>.
+    item1 = _first_group(finditem1, item_html)
+    if link is None or name is None or item1 is None:
+        return None
+
+    # item2 and item3 are both cut out of item1; the frequency and ISSN are
+    # read from item2 and the CN from item3.
+    item2 = _first_group(finditem2, item1)
+    item3 = _first_group(finditem3, item1)
+    if item2 is None or item3 is None:
+        return None
+
+    frequency = _first_group(findfrequency, item2)
+    issn = _first_group(findNUM, item2)
+    cn = _first_group(findNUM, item3)
+    if frequency is None or issn is None or cn is None:
+        return None
+    return [link, name, frequency, issn, cn]
+
+
+def getData(baseurl):
+    """Scrape ten listing pages and return one row per journal.
+
+    Args:
+        baseurl: URL prefix; page ``i`` is fetched from ``baseurl + str(i) + "/"``.
+
+    Returns:
+        A list of ``[link, name, frequency, ISSN, CN]`` lists, in page order.
+        Items whose HTML does not match the expected patterns are skipped
+        (a notice is printed) rather than aborting the whole crawl.
+    """
+    datalist = []
+    # NOTE(review): pages are requested as _0/ ... _9/; confirm that the site
+    # numbers its listing pages from 0 rather than 1.
+    for page in range(0, 10):
+        html = askURL(baseurl + str(page) + "/")
+
+        # askURL returns "" on failure; parsing "" simply yields no items.
+        soup = BeautifulSoup(html, "html.parser")
+        for item in soup.find_all('li', class_="list_item"):
+            data = _parse_item(str(item))
+            if data is None:
+                print("skipped an unparsable item on page %d" % page)
+                continue
+            datalist.append(data)
+    return datalist
+
+    
+
+
+# Fetch the page content of a given URL
+def askURL(url):
+    """Download *url* and return its body decoded as UTF-8.
+
+    A browser-like User-Agent header is sent with the request. If the request
+    fails with ``urllib.error.URLError``, the error's ``code`` and/or
+    ``reason`` (whichever are present) are printed and an empty string is
+    returned.
+    """
+    head = {  # imitate a browser's request headers
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.7271 SLBChan/105"
+    }
+
+    request = urllib.request.Request(url, headers=head)
+    html = ""
+    try:
+        # The context manager closes the HTTP response even when read() or
+        # decode() raises, so the connection is not leaked.
+        with urllib.request.urlopen(request) as response:
+            html = response.read().decode("utf-8")
+    except urllib.error.URLError as e:
+        if hasattr(e, "code"):
+            print(e.code)
+        if hasattr(e, "reason"):
+            print(e.reason)
+
+    return html
+
+
+# Save the data
+def saveData(datalist,savepath):
+    """Write the scraped rows to an .xls workbook at *savepath*.
+
+    Args:
+        datalist: sequence of rows; the first five values of each row are
+            written under the header columns, in order.
+        savepath: path of the workbook file to create.
+
+    Row 0 of the sheet holds the column headers and data starts at row 1.
+    Exactly the rows present in *datalist* are written, so a short (or empty)
+    crawl no longer raises IndexError and rows past the 200th are not dropped.
+    """
+    print("save...")
+    book = xlwt.Workbook(encoding="utf-8",style_compression=0)
+    sheet = book.add_sheet('中文期刊10页',cell_overwrite_ok=True)
+    col = ("期刊链接","期刊名称","刊率","ISSN","CN")
+    for col_idx, title in enumerate(col):
+        sheet.write(0, col_idx, title)
+    # Sheet rows are offset by one because row 0 is the header.
+    for row_idx, data in enumerate(datalist, start=1):
+        print("第%d条" % row_idx)
+        for col_idx, value in enumerate(data[:len(col)]):
+            sheet.write(row_idx, col_idx, value)
+    book.save(savepath)
+
+    
+
+
+
+
+
+
+if __name__ == "__main__":
+    # Run the crawl only when this file is executed as a script, then print a
+    # completion message.
+    main()
+    print("爬取完毕!")