{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "下载完成!\n"
     ]
    }
   ],
   "source": [
    "import csv\n",
    "import re\n",
    "import time\n",
    "\n",
    "import requests\n",
    "\n",
    "# Timestamped file name, so every run writes to a fresh CSV file.\n",
    "timenow = time.strftime('%Y-%m-%d-%H%M%S', time.localtime())\n",
    "file = '猫眼电影top100榜-%s.csv' % (timenow)\n",
    "# Write the header row once; get_one_page() appends the data rows.\n",
    "with open(file, 'a+', encoding='gb18030', newline='') as f:\n",
    "    writer_f = csv.writer(f)\n",
    "    writer_f.writerow(['排名', '电影名称', '主演', '上映时间', '评分'])\n",
    "\n",
    "# One <dd> element per movie. The six groups are: rank, title, cast,\n",
    "# release date, and the integer and fraction parts of the score.\n",
    "# NOTE(review): the HTML tags in this literal were reconstructed from the\n",
    "# keyword anchors that survived -- confirm against the live page markup.\n",
    "MOVIE_PATTERN = re.compile(\n",
    "    '<dd>.*?board-index.*?>(.*?)</i>.*?name.*?a.*?>(.*?)</a>.*?star.*?>(.*?)</p>'\n",
    "    '.*?releasetime.*?>(.*?)</p>.*?integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>',\n",
    "    re.S)\n",
    "\n",
    "\n",
    "def get_one_page(url):\n",
    "    \"\"\"Download one page of the board and append its movies to the CSV file.\n",
    "\n",
    "    Every match of MOVIE_PATTERN in the response body is printed and written\n",
    "    as one row to the module-level ``file``.\n",
    "\n",
    "    Args:\n",
    "        url: Address of the board page to fetch.\n",
    "\n",
    "    Raises:\n",
    "        requests.RequestException: If the request fails, times out or\n",
    "            returns an HTTP error status.\n",
    "    \"\"\"\n",
    "    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0'}\n",
    "    # Bound the wait and fail loudly on an error status instead of silently\n",
    "    # parsing an error page into zero rows.\n",
    "    response = requests.get(url, headers=headers, timeout=10)\n",
    "    response.raise_for_status()\n",
    "\n",
    "    items = MOVIE_PATTERN.findall(response.text)\n",
    "    # Open the file once per page rather than once per row.\n",
    "    with open(file, 'a+', encoding='gb18030', newline='') as f:\n",
    "        writer = csv.writer(f)\n",
    "        for item in items:\n",
    "            rank = item[0]\n",
    "            title = item[1]\n",
    "            cast = item[2].strip()\n",
    "            release = item[3]\n",
    "            # The score is captured as two groups (integer and fraction part).\n",
    "            score = item[4] + item[5]\n",
    "            print(rank, title, cast, release, score)\n",
    "            writer.writerow([rank, title, cast, release, score])\n",
    "\n",
    "\n",
    "# The board is split into 10 pages; the offset advances by 10 per page.\n",
    "for i in range(10):\n",
    "    page = i * 10\n",
    "    url = 'https://maoyan.com/board/4?offset=' + str(page)\n",
    "    # Fetch inside the loop so every page is downloaded, pausing one second\n",
    "    # between requests.\n",
    "    get_one_page(url)\n",
    "    time.sleep(1)\n",
    "print('下载完成!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}