parent
8c2d5e33d1
commit
7d836cd3e8
@ -1,116 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import requests
|
||||
from lxml import etree
|
||||
import xlwt
|
||||
import pandas as pd
|
||||
|
||||
def main():
    """Scrape the Douban box-office doulist and save the rows to Excel.

    Requests 14 list pages (the ``start`` query offset advances by 25 per
    page), extracts the title, box office, score, director, cast, genre,
    region and year text with XPath, prints every assembled row, and finally
    passes all rows to ``save_excel``.

    A page whose HTTP status is not OK is reported and skipped instead of
    being parsed as if it were a result page.

    Raises:
        requests.exceptions.RequestException: if a request fails at the
            network level or exceeds the timeout.
    """
    # Loop-invariant request settings, built once instead of once per page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36',
    }
    url_template = r'https://www.douban.com/doulist/119166792/?start={}&sort=seq&playable=0&sub_type='
    # Labels that prefix the detail text nodes, in the order the row uses them.
    labels = ('导演:', '主演:', '类型:', '制片国家/地区:', '年份:')

    infos_list = []
    rank = 1
    for page_no, index in enumerate(range(0, 14), start=1):
        url = url_template.format(index * 25)
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=10)
        if not response.ok:
            print(f'第{page_no}页请求失败, 状态码: {response.status_code}')
            continue
        html = etree.HTML(response.content.decode('utf8'))

        # Movie titles: strip whitespace and drop blank text nodes.
        movies = html.xpath('//div[@class="doulist-item"]/div/div[2]/div[4]/a/text()')
        movie_name_list = [name for name in (m.strip() for m in movies) if name]

        # Box-office figures, cleaned the same way as the titles.
        box_office = html.xpath('//div[@class="doulist-item"]/div/div[3]/div/blockquote/text()')
        box_office_list = [text for text in (b.strip() for b in box_office) if text]

        # Scores are used exactly as the XPath returns them.
        score_list = html.xpath('//div[@class="doulist-item"]/div/div[2]/div[5]/span[2]/text()')

        # Director / cast / genre / region / year all live in the same node as
        # separate text children; sort each one into the list for its label.
        infos = html.xpath('//div[@class="doulist-item"]/div/div[2]/div[6]/text()')
        fields = {label: [] for label in labels}
        for info in infos:
            text = info.strip()
            for label, values in fields.items():
                if text.startswith(label):
                    values.append(parse_title(info, label))
                    break

        # NOTE(review): zip stops at the shortest list and pairs values purely
        # by position, so an entry that lacks one of the fields would shift
        # later values onto the wrong movie - confirm against the live page.
        rows = zip(movie_name_list, box_office_list, score_list,
                   *(fields[label] for label in labels))
        for movie_name, box_office_text, score, director, actor, genre, area, year in rows:
            dic = {
                '排名': rank,
                '电影名称': movie_name,
                '票房': box_office_text,
                '评分': score,
                '导演': director,
                '主演': actor,
                '类型': genre,
                '地区': area,
                '上映年份': year,
            }
            print(dic)
            infos_list.append(dic)
            rank += 1
        print(f'已下载完第{page_no}页数据!!!!!')

    save_excel(infos_list)
|
||||
|
||||
|
||||
|
||||
def parse_title(title, rule):
    """Return *title* with every occurrence of *rule* removed and the
    surrounding whitespace trimmed.

    Args:
        title: Raw text, e.g. a labelled detail line from the page.
        rule: The label substring to remove.
    """
    without_label = title.replace(rule, '')
    return without_label.strip()
|
||||
|
||||
def save_excel(infos_list):
    """Write a list of row dictionaries to ``./电影票房排行榜.xls``.

    The keys of the first dictionary become the header row; every dictionary
    is then written as one row, with columns in that same key order.

    Args:
        infos_list: Non-empty list of dictionaries that share the keys of the
            first element.
    """
    book = xlwt.Workbook(encoding='utf8')
    sheet = book.add_sheet('古诗文')
    columns = list(infos_list[0].keys())

    # Header row.
    for col_idx, title in enumerate(columns):
        sheet.write(0, col_idx, title)

    # Data rows start at spreadsheet row 1, right below the header.
    for row_idx, record in enumerate(infos_list, start=1):
        for col_idx, title in enumerate(columns):
            sheet.write(row_idx, col_idx, record[title])

    book.save(r"./电影票房排行榜.xls")
|
||||
|
||||
|
||||
def xls_to_csv_pd():
    """Convert ``电影票房排行榜.xls`` to ``data.csv`` with pandas.

    The first spreadsheet column is used as the DataFrame index, and the CSV
    is written with UTF-8 encoding.
    """
    pd.read_excel('电影票房排行榜.xls', index_col=0).to_csv('data.csv', encoding='utf8')
|
||||
|
||||
if __name__ == '__main__':
    # Scrape and write the spreadsheet first; the CSV conversion reads the
    # file that step produces.
    main()
    xls_to_csv_pd()
|
||||
|
Loading…
Reference in new issue