Delete 'movie_charts.py'

master
hnu202010040227 4 years ago
parent 8c2d5e33d1
commit 7d836cd3e8

@ -1,116 +0,0 @@
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import xlwt
import pandas as pd
# Label prefixes that open the lines of an item's info block, paired with the
# output column each value is stored under.
_INFO_LABELS = (
    ('导演:', '导演'),
    ('主演:', '主演'),
    ('类型:', '类型'),
    ('制片国家/地区:', '地区'),
    ('年份:', '上映年份'),
)


def _joined_text(nodes):
    """Return the non-blank strings in *nodes*, stripped and space-joined."""
    return ' '.join(filter(None, (node.strip() for node in nodes)))


def _parse_item(item, rank):
    """Build one output row (a dict) from a single ``doulist-item`` element.

    Every field is looked up relative to *item*, so a field missing from one
    entry yields an empty string for that entry only and cannot shift the
    values of neighbouring entries.
    """
    scores = item.xpath('./div/div[2]/div[5]/span[2]/text()')
    row = {
        '排名': rank,
        '电影名称': _joined_text(item.xpath('./div/div[2]/div[4]/a/text()')),
        '票房': _joined_text(item.xpath('./div/div[3]/div/blockquote/text()')),
        '评分': scores[0] if scores else '',
        '导演': '',
        '主演': '',
        '类型': '',
        '地区': '',
        '上映年份': '',
    }
    for info in item.xpath('./div/div[2]/div[6]/text()'):
        stripped = info.strip()
        for label, column in _INFO_LABELS:
            if stripped.startswith(label):
                # Keep the first value seen for a label within this entry.
                if not row[column]:
                    row[column] = parse_title(info, label)
                break
    return row


def main():
    """Scrape the Douban list pages and hand the collected rows to save_excel.

    Fetches 14 pages (the ``start`` query parameter advances by 25 per page),
    extracts one row per ``doulist-item`` element, prints each row and a
    per-page progress line, then passes the full list to ``save_excel``.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched
            (including when the request times out).
    """
    # The headers never change, so build them once instead of once per page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36',
    }
    infos_list = []
    for page_index in range(14):
        url = r'https://www.douban.com/doulist/119166792/?start={}&sort=seq&playable=0&sub_type='.format(page_index * 25)
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, headers=headers, timeout=30)
        html = etree.HTML(response.content.decode('utf8'))
        # Parse entry by entry rather than zipping page-wide lists: zipping
        # misaligns every following row as soon as one entry lacks a field.
        for item in html.xpath('//div[@class="doulist-item"]'):
            # The rank is the running 1-based position across all pages.
            row = _parse_item(item, len(infos_list) + 1)
            print(row)
            infos_list.append(row)
        print(f'已下载完第{page_index + 1}页数据!!!!!')
    save_excel(infos_list)
def parse_title(title, rule):
    """Return *title* with the label *rule* removed and whitespace trimmed.

    Only the first occurrence of *rule* is removed, so a value that happens
    to contain the label text again is left intact.

    Args:
        title: Raw text, e.g. a line beginning with a field label.
        rule: The label text to remove.

    Returns:
        The remaining text with leading and trailing whitespace stripped.
    """
    return title.replace(rule, '', 1).strip()
def save_excel(infos_list):
    """Write the rows in *infos_list* to ``./电影票房排行榜.xls``.

    The keys of the first row become the header line (row 0); each following
    sheet row holds the values of one dict, in the same key order.

    Args:
        infos_list: List of dicts sharing the same keys. May be empty, in
            which case a workbook with an empty sheet is written.

    Raises:
        KeyError: if a later row lacks one of the first row's keys.
    """
    workbook = xlwt.Workbook(encoding='utf8')
    # NOTE(review): the sheet name looks left over from another script; it is
    # kept unchanged so the generated file stays the same.
    sheet = workbook.add_sheet('古诗文')
    # With no rows there is no first dict to take headers from; fall back to
    # an empty header instead of failing with IndexError.
    keys = list(infos_list[0]) if infos_list else []
    for col, key in enumerate(keys):
        sheet.write(0, col, key)
    for row_index, record in enumerate(infos_list, start=1):
        for col, key in enumerate(keys):
            sheet.write(row_index, col, record[key])
    workbook.save(r"./电影票房排行榜.xls")
def xls_to_csv_pd():
    """Convert ``电影票房排行榜.xls`` to ``data.csv`` using pandas.

    The first spreadsheet column is used as the DataFrame index, and the CSV
    is written with UTF-8 encoding.
    """
    pd.read_excel(
        '电影票房排行榜.xls',
        index_col=0,
    ).to_csv(
        'data.csv',
        encoding='utf8',
    )
# Script entry point: scrape and save the spreadsheet first, then convert the
# saved spreadsheet to CSV.
if __name__ == "__main__":
    main()
    xls_to_csv_pd()
Loading…
Cancel
Save