|
|
@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cells": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 1,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
"下载完成!\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"import requests\n",
|
|
|
|
|
|
|
|
"import re\n",
|
|
|
|
|
|
|
|
"import csv, time\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"# Build a timestamped name for the output CSV file.\n",
"timenow = time.strftime(\"%Y-%m-%d-%H%M%S\")\n",
"file = '猫眼电影top100榜-%s.csv' % timenow\n",
"# Write the header row (append mode, gb18030-encoded, no newline translation).\n",
"with open(file, mode='a+', encoding='gb18030', newline='') as f:\n",
"    writer_f = csv.writer(f)\n",
"    writer_f.writerow(['排名', '电影名称', '主演', '上映时间', '评分'])\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"# 定义一个爬取其中一页的电影信息\n",
|
|
|
|
|
|
|
|
"def get_one_page(url):\n",
"    \"\"\"Fetch one board page and append its movie rows to the CSV file.\n",
"\n",
"    Downloads ``url``, extracts the six captured fields of every ``<dd>``\n",
"    entry with a regular expression, prints each row and appends it to the\n",
"    CSV named by the module-level ``file`` variable.\n",
"\n",
"    Args:\n",
"        url: Address of the page to download.\n",
"\n",
"    Returns:\n",
"        None. Rows are printed and written to the CSV as a side effect.\n",
"    \"\"\"\n",
"    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0'}\n",
"    # A timeout keeps a stalled connection from blocking the notebook forever.\n",
"    response = requests.get(url, headers=headers, timeout=10).text\n",
"\n",
"    pattern = re.compile(\n",
"        r'<dd>.*?board-index.*?>(.*?)</i>.*?name.*?a.*?>(.*?)</a>.*?star.*?>(.*?)</p>.*?releasetime.*?>(.*?)</p>.*?integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>',\n",
"        re.S)\n",
"    items = pattern.findall(response)\n",
"    if not items:\n",
"        # Nothing matched, so there is nothing to print or write.\n",
"        return\n",
"\n",
"    # Open the CSV once per page instead of once per row.\n",
"    with open(file, 'a+', encoding='gb18030', newline='') as f:\n",
"        writer = csv.writer(f)\n",
"        for pm, mc, zy, sj, integer_part, fraction_part in items:\n",
"            zy = zy.strip()\n",
"            # The score is captured as separate integer and fraction pieces.\n",
"            pf = integer_part + fraction_part\n",
"            print(pm, mc, zy, sj, pf)\n",
"            writer.writerow([pm, mc, zy, sj, pf])\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"# 共有10个分页,分批写入\n",
|
|
|
|
|
|
|
|
"for page in range(0, 100, 10):\n",
"    url = 'https://maoyan.com/board/4?offset=' + str(page)\n",
"    # The fetch must run inside the loop; when it sat after the loop only the\n",
"    # last URL (offset=90) was ever downloaded.\n",
"    get_one_page(url)\n",
"    # Wait one second between requests (presumably to go easy on the server).\n",
"    time.sleep(1)\n",
"print('下载完成!')"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
|
|
|
"metadata": {
|
|
|
|
|
|
|
|
"collapsed": true
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
"outputs": [],
|
|
|
|
|
|
|
|
"source": []
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"metadata": {
|
|
|
|
|
|
|
|
"kernelspec": {
|
|
|
|
|
|
|
|
"display_name": "Python 3",
|
|
|
|
|
|
|
|
"language": "python",
|
|
|
|
|
|
|
|
"name": "python3"
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
"language_info": {
|
|
|
|
|
|
|
|
"codemirror_mode": {
|
|
|
|
|
|
|
|
"name": "ipython",
|
|
|
|
|
|
|
|
"version": 3
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
"file_extension": ".py",
|
|
|
|
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
|
|
|
|
"name": "python",
|
|
|
|
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
|
|
|
|
"version": "3.6.1"
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
"nbformat": 4,
|
|
|
|
|
|
|
|
"nbformat_minor": 2
|
|
|
|
|
|
|
|
}
|