You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

58 lines
2.4 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import pandas as pd
import os
# Deduplicate movie websites
def deduplication(folder_path, prefix_condition):
    """Merge matching Excel workbooks and drop duplicate movie-site rows.

    Every ``.xlsx`` file directly inside ``folder_path`` whose name starts
    with ``prefix_condition`` is read from its ``Sheet1`` worksheet. The rows
    are concatenated, de-duplicated on the ``电影网站`` column (the first
    occurrence is kept) and written, without the index column, to
    ``豆瓣电影网站/<prefix>/<prefix>电影网站(整合后).xlsx`` relative to the
    current working directory. The output folder is created if missing.

    Args:
        folder_path: Directory to scan for source workbooks.
        prefix_condition: Prefix a filename must start with to be included;
            also used to name the output folder and the output file.

    Returns:
        None. When no workbook matches, a notice is printed and nothing is
        written.

    Raises:
        KeyError: If the merged data has no ``电影网站`` column.
    """
    key_column = '电影网站'

    # os.listdir returns names in arbitrary order; sorting makes the row that
    # survives de-duplication reproducible from run to run.
    matching_files = sorted(
        name
        for name in os.listdir(folder_path)
        if name.endswith('.xlsx') and name.startswith(prefix_condition)
    )

    frames = [
        pd.read_excel(os.path.join(folder_path, name), sheet_name='Sheet1')
        for name in matching_files
    ]

    # With nothing to merge there is no key column to de-duplicate on, so
    # bail out instead of failing with a KeyError on an empty DataFrame.
    if not frames:
        print(f'未找到以 {prefix_condition} 开头的Excel文件,未生成整合文件')
        return

    # One concat over the collected frames avoids re-copying the accumulated
    # data on every iteration. A single keep-first pass over the result yields
    # the same rows as de-duplicating each file first and then the whole.
    merged_df = pd.concat(frames, ignore_index=True)
    merged_df = merged_df.drop_duplicates(subset=key_column)

    # to_excel does not create missing parent folders.
    output_dir = os.path.join('豆瓣电影网站', prefix_condition)
    os.makedirs(output_dir, exist_ok=True)
    output_name = f'{prefix_condition}电影网站(整合后).xlsx'
    merged_df.to_excel(os.path.join(output_dir, output_name), sheet_name='Sheet1',
                       index=False)

    print(f'整合完成文件为:{prefix_condition}电影网站(整合后).xlsx')
if __name__ == '__main__':
    # Script entry point: merge the workbooks in the current directory whose
    # filenames start with the given prefix.
    deduplication(folder_path='.', prefix_condition='韩国')
# Deduplicating a single file
# import pandas as pd # import pandas under the alias pd for data processing and analysis
# from openpyxl import load_workbook # import openpyxl's load_workbook for working with Excel files
#
# # Read the 'Sheet1' worksheet of the Excel file '新闻.xlsx' with pandas' read_excel
# df = pd.read_excel('新闻.xlsx', sheet_name='Sheet1')
#
# # Print the column names of df to confirm the headers before further processing
# print(df.columns)
#
# # Drop rows whose '新闻链接' value is a duplicate, keeping one row per link
# df = df.drop_duplicates(subset='新闻链接')
#
# # Write the processed DataFrame to the 'Sheet1' worksheet of the Excel file '详情页网址.xlsx'
# # index=False leaves the DataFrame index column out of the written file
# df.to_excel('详情页网址.xlsx', sheet_name='Sheet1', index=False)