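"""Scraper for the Chinese academic-journal index on zhangqiaokeyan.com.

Fetches the first 10 listing pages, extracts each journal's link, name,
publication frequency, ISSN and CN number, and writes the results to an
.xls workbook with xlwt.
"""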

from bs4 import BeautifulSoup   # HTML parsing / data extraction
import re                       # regular expressions for text matching
import urllib.request           # build the URL request and fetch the page
import urllib.error
import xlwt                     # Excel (.xls) output
import sqlite3                  # SQLite support (not used in this script)


def main():
    baseurl = "https://www.zhangqiaokeyan.com/academic-journal-cn_"
    # Crawl the listing pages
    datalist = getData(baseurl)
    savepath = "中文期刊10页.xls"   # "Chinese journals, 10 pages"
    # Save the collected data
    saveData(datalist, savepath)
    # askURL("https://www.zhangqiaokeyan.com/academic-journal-cn_1/")


# Regular expressions for extracting the individual fields
findLink = re.compile(r'<a href="(.*?)">')           # journal link
findname = re.compile(r'<a.*>(.*)</a>')              # journal name
finditem1 = re.compile(r'<span>(.*)</span>', re.S)
finditem2 = re.compile(r'(.*)</span>', re.S)
finditem3 = re.compile(r'<span>(.*)', re.S)
findfrequency = re.compile(r'(.*)</span>', re.S)     # publication frequency
findNUM = re.compile(r'<span>(.*)')                  # ISSN and CN numbers
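# Note: finditem1 captures the text between the first <span> and the last </span>
# of a list item; finditem2 and finditem3 then keep, respectively, the part of that
# text before its last </span> and the part after its first <span>. getData() pulls
# the publication frequency, ISSN and CN out of those fragments.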


# Crawl the listing pages and parse them
def getData(baseurl):
    datalist = []
    for i in range(0, 10):                  # loop over the first 10 listing pages
        url = baseurl + str(i) + "/"
        html = askURL(url)
        # Parse the returned page
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('li', class_="list_item"):
            # All information for one journal
            data = []
            item = str(item)
            # Pull the individual fields out with the precompiled patterns
            link = re.findall(findLink, item)[0]              # journal link
            data.append(link)
            name = re.findall(findname, item)[0]              # journal name
            data.append(name)
            item1 = re.findall(finditem1, item)[0]
            item2 = re.findall(finditem2, item1)[0]
            item3 = re.findall(finditem3, item1)[0]
            frequency = re.findall(findfrequency, item2)[0]   # publication frequency
            data.append(frequency)
            ISSN = re.findall(findNUM, item2)[0]              # ISSN
            data.append(ISSN)
            CN = re.findall(findNUM, item3)[0]                # CN number
            data.append(CN)
            datalist.append(data)
    # print(datalist)
    return datalist


# Fetch the HTML content of a single URL
def askURL(url):
    # Browser-like headers (user agent) so the site serves the normal page
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.7271 SLBChan/105"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        # print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
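
# On failure askURL() prints the HTTP status code and/or reason and returns an
# empty string, so getData() simply finds no list items for that page.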


# Save the data to an .xls workbook
def saveData(datalist, savepath):
    print("save...")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('中文期刊10页', cell_overwrite_ok=True)
    # Column headers: journal link, journal name, publication frequency, ISSN, CN
    col = ("期刊链接", "期刊名称", "刊频", "ISSN", "CN")
    for i in range(0, 5):
        sheet.write(0, i, col[i])
    # One row per journal
    for i in range(len(datalist)):
        print("%d" % (i + 1))
        data = datalist[i]
        for j in range(0, 5):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)
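
# Note: xlwt produces the legacy .xls format (at most 65,536 rows per sheet);
# for .xlsx output, openpyxl is the usual alternative.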


if __name__ == "__main__":
    # Run the crawler
    main()
    print("Done crawling!")