You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

32 lines
1.3 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import os

import numpy as np
import pandas as pd
# Data-cleaning script: load the raw film tables, then merge the film-info
# table with the score table and write the result to ./dataCleanFile/.

# Load the three raw tables. filmInfo.csv and score.csv are decoded as GBK;
# box_info.csv is read with pandas' default encoding.
filmInfo = pd.read_csv('./file/filmInfo.csv', encoding='gbk')
# box_info is only referenced by the disabled review-export step below.
box_info = pd.read_csv('./file/box_info.csv')
score = pd.read_csv('./file/score.csv', encoding='gbk')
# print(filmInfo)
# print(box_info)
# print(score)

# --- Disabled cleaning steps (kept for reference) ---------------------------
# Strip everything except digits from the box-office column ('票房'), then
# replace missing values in that column with 0.
# filmInfo['票房']=filmInfo['票房'].replace('\D','',regex=True).replace(np.nan,0,regex=True)
# # Keep only the date-formatted part of the release date ('上映时间') by
# # removing the Chinese characters.
# filmInfo['上映时间']=filmInfo['上映时间'].replace('[\u4e00-\u9fa5]','',regex=True)
# # Keep only the first genre listed in '电影类型'.
# filmInfo['电影类型']=filmInfo['电影类型'].str.split("/", expand=True)[0]
# # Keep only the numeric part of the runtime ('电影时长').
# filmInfo['电影时长']=filmInfo['电影时长'].replace('\D','',regex=True)
# print(filmInfo)
#
# Fill missing ratings in the score table with 0 ('暂无评分' = "no rating yet").
# score['评分']=score['评分'].replace('暂无评分',0,regex=True)
# print(score)
# Join every review ('评论') from the review table into a single string and
# save it to the 评论.txt file.
# fp=open('./dataCleanFile/评论.txt','w',encoding='utf-8')
# fp.write(';'.join(box_info['评论']))
# fp.close()
#
#
# Merge the film-info table and the score table on the film title ('电影名').
# The outer join keeps films that appear in only one of the two tables.
film_all = pd.merge(filmInfo, score, on='电影名', how='outer')
# print(film_all)

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing; exist_ok keeps reruns harmless.
os.makedirs('./dataCleanFile', exist_ok=True)
film_all.to_csv('./dataCleanFile/film_all.csv', index=False)