# Origin: new_house_clean.py as added by commit
# 5a1bae3e5b8f63c8b0fe3e77bcaaccb88b253752 (pagqhcmx2, 2024-05-27,
# subject "Clean crawled data"), reconstructed from the one-line patch text.
"""Clean the scraped Fangtianxia new-house listing data.

Run as a script, this reads the raw workbook, derives one flag column per
room type and a min/max area pair, drops incomplete rows, writes the cleaned
workbook and (where the platform supports it) opens it.
"""
import os

import pandas as pd

# Workbook locations, relative to the current working directory.
INPUT_PATH = r'./房天下新房数据.xlsx'
OUTPUT_PATH = r'./房天下新房数据(清洗后).xlsx'

# Room types that get their own flag column in the cleaned output.
ROOM_TYPES = ['一居', '二居', '三居', '四居', '五居', '五居以上']

# Final column order of the cleaned sheet.
OUTPUT_COLUMNS = ['名称', '一居', '二居', '三居', '四居', '五居', '五居以上',
                  '最小面积(㎡)', '最大面积(㎡)', '行政区', '价格(元/㎡)']


def clean_new_house_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw listing table.

    Args:
        df: Raw listings. The name, room-type, area, district and price
            columns must be present; the room-type, area and district
            columns are processed with the pandas ``.str`` accessor.

    Returns:
        A new frame holding exactly ``OUTPUT_COLUMNS``, without rows that
        have any missing value, indexed 0..n-1. ``df`` is not modified.

    Raises:
        KeyError: If one of the required input columns is missing.
    """
    cleaned = df.copy()

    # Strip the tabs and newlines left in the district text.
    cleaned['行政区'] = cleaned['行政区'].str.replace(r'[\t\n]', '', regex=True)

    # One flag column per room type, built from the '/'-separated room-type
    # text. Reindexing on the fixed list guarantees every flag column exists
    # even when a type never occurs in the data, so the column selection
    # below cannot fail; rows with a missing room-type value get all "no".
    flags = (
        cleaned['户型']
        .str.get_dummies(sep='/')
        .reindex(columns=ROOM_TYPES, fill_value=0)
        .astype(bool)
    )
    for room in ROOM_TYPES:
        cleaned[room] = flags[room].map({True: '有', False: '无'})

    # Area is either a single value or "min~max"; a single value is used
    # for both bounds. Values are kept as text, exactly as split.
    area_parts = cleaned['面积'].str.split('~')
    smallest = area_parts.str.get(0)
    largest = area_parts.str.get(1)
    cleaned['最小面积(㎡)'] = smallest
    cleaned['最大面积(㎡)'] = largest.fillna(smallest)

    # Reorder the columns, drop rows with any missing value, renumber rows.
    return cleaned.loc[:, OUTPUT_COLUMNS].dropna().reset_index(drop=True)


def main() -> None:
    """Read the raw workbook, clean it, save the result and open it."""
    raw = pd.read_excel(INPUT_PATH, index_col=0)
    cleaned = clean_new_house_data(raw)

    # Store the cleaned data.
    cleaned.to_excel(OUTPUT_PATH)

    # os.startfile only exists on Windows; elsewhere the file is still
    # written, it just is not opened automatically. normpath drops the
    # leading "./" so the shell receives the bare file name.
    if hasattr(os, 'startfile'):
        os.startfile(os.path.normpath(OUTPUT_PATH))


if __name__ == '__main__':
    main()