From 0a971573e2557738b74ef8e411beafa1fed630f5 Mon Sep 17 00:00:00 2001
From: pqofigsmb <1519223402@qq.com>
Date: Thu, 27 May 2021 17:31:34 +0800
Subject: [PATCH] Scrape the Douban top 25 movies and generate an Excel sheet.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 SpiderWebsite/demo/views.py | 147 ++++++++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)

diff --git a/SpiderWebsite/demo/views.py b/SpiderWebsite/demo/views.py
index 084bbf9..4048fda 100644
--- a/SpiderWebsite/demo/views.py
+++ b/SpiderWebsite/demo/views.py
@@ -366,3 +366,150 @@ def main():
 if __name__ == '__main__':
     main()


import re                      # regular expressions for text matching
import urllib.error
import urllib.request
import xlwt                    # Excel (.xls) writing
from bs4 import BeautifulSoup  # HTML parsing


def main():
    baseurl = "https://movie.douban.com/top250?start="
    # 1. Crawl the pages
    datalist = begin_spider(baseurl)
    savepath = "豆瓣电影top25.xls"
    # 3. Save the data (step 2, parsing, happens inside begin_spider)
    saveData(datalist, savepath)


# Link to the movie's detail page
findLink = re.compile(r'<a href="(.*?)">')  # precompiled regex object (the string rule)
# Poster image URL
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)
# Movie title
findTitle = re.compile(r'<span class="title">(.*)</span>')
# Rating
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
# Number of ratings
findJudge = re.compile(r'<span>(\d*)人评价</span>')
# One-line summary
findInq = re.compile(r'<span class="inq">(.*?)</span>')
# "Related info" block (re.S lets '.' match newlines)
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)
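
# A minimal sketch (not part of the crawl itself) of how the precompiled
# patterns above extract fields; the HTML fragment is a simplified, hypothetical
# stand-in for one Douban "item" block, not real page source.
def _regex_demo():
    sample = ('<a href="https://movie.douban.com/subject/1292052/">\n'
              '<span class="title">肖申克的救赎</span>\n'
              '<span class="rating_num" property="v:average">9.7</span>\n'
              '<span>2000000人评价</span>')
    print(re.findall(findLink, sample))    # ['https://movie.douban.com/subject/1292052/']
    print(re.findall(findTitle, sample))   # ['肖申克的救赎']
    print(re.findall(findRating, sample))  # ['9.7']
    print(re.findall(findJudge, sample))   # ['2000000']
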

# Crawl the pages and parse each movie entry
def begin_spider(baseurl):
    datalist = []
    for i in range(0, 1):  # fetch the page once: only the first 25 movies
        url = baseurl + str(i * 25)
        html = askURL(url)  # the raw page source

        # 2. Parse the data item by item
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):  # each matching <div> is one movie
            # print(item)  # uncomment to inspect a raw movie item
            data = []  # all fields of one movie
            item = str(item)

            link = re.findall(findLink, item)[0]  # apply the precompiled regex to the item HTML
            data.append(link)  # detail-page link

            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)  # poster image

            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                ctitle = titles[0]
                data.append(ctitle)  # Chinese title
                otitle = titles[1].replace("/", "")  # strip the separator slash
                data.append(otitle)  # foreign title
            else:
                data.append(titles[0])
                data.append(' ')  # no foreign title; leave blank

            rating = re.findall(findRating, item)[0]
            data.append(rating)  # rating

            judgeNum = re.findall(findJudge, item)[0]
            data.append(judgeNum)  # number of ratings

            inq = re.findall(findInq, item)  # may be empty: not every movie has a summary
            if len(inq) != 0:
                data.append(inq[0].replace("。", " "))  # drop the trailing full stop
            else:
                data.append(" ")  # leave blank

            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)  # remove <br/> tags
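
# A hedged sketch of an extension (assumed, not in this patch): crawl all 10
# pages of the top 250 by stepping the "start" query parameter, pausing between
# requests; the 1-second delay is an assumed politeness value, not Douban policy.
def spider_all_pages(baseurl):
    import time
    datalist = []
    for i in range(0, 10):                    # 10 pages x 25 movies = 250
        html = askURL(baseurl + str(i * 25))  # same URL scheme as begin_spider
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            datalist.append(str(item))        # raw item HTML; parse as in begin_spider
        time.sleep(1)                         # assumed delay between page fetches
    return datalist
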
            bd = re.sub('/', " ", bd)  # replace '/'
            data.append(bd.strip())  # strip leading/trailing whitespace
            print(data)
            datalist.append(data)  # one fully parsed movie

    return datalist


# Fetch the raw HTML of a single URL
def askURL(url):
    head = {
        # Browser-like headers: the User-Agent tells the Douban server what kind
        # of client we are (i.e. what content we can accept), so the request is
        # not rejected as an obvious bot.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    # Basic error handling around the request
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


# Save the data to an .xls workbook
def saveData(datalist, savepath):
    print("save...")
    book = xlwt.Workbook(encoding="utf-8")
    sheet = book.add_sheet('豆瓣电影top25')
    col = ("电影链接", "图片链接", "影片中文名", "影片外文名", "评分", "评价数", "概况", "相关信息")
    for i in range(0, 8):
        sheet.write(0, i, col[i])  # header row
    for i in range(0, len(datalist)):  # one row per movie (25 for a single page)
        print("第%d条" % (i + 1))
        data = datalist[i]
        for j in range(0, 8):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)


if __name__ == "__main__":
    main()
    print("爬取完毕!")
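

# A small verification sketch (assumes the separate `xlrd` package is installed;
# it reads .xls files): load the generated workbook after a run and print the
# header row plus the first movie record.
def _verify_xls(path="豆瓣电影top25.xls"):
    import xlrd
    sheet = xlrd.open_workbook(path).sheet_by_index(0)
    print(sheet.row_values(0))      # header row written by saveData
    if sheet.nrows > 1:
        print(sheet.row_values(1))  # first movie record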