You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

43 lines
1.4 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

'''清洗房天下新房数据'''
import os
import pandas as pd
# 读取数据
df1 = pd.read_excel(r'./房天下新房数据.xlsx', index_col=0)
# 清洗行政区\t\n
df1['行政区'] = df1['行政区'].str.replace('\t', '').str.replace('\n', '')
# 处理户型字段
df2 = df1.dropna(subset=['户型'])
# 索引户型list
hxdict = dict(df2['户型'].str.split('/'))
for i, vlist in hxdict.items():
for v in vlist:
df1.loc[i, v] = ''
col_list = df1.columns.tolist()[5:]
for col in col_list:
df1.loc[df1[col].isna(), col] = ''
# 处理面积字段
df3 = df1.dropna(subset=['面积'])
# 索引面积list
mjlist = dict(df3['面积'].str.split('~'))
for i, vlist in mjlist.items():
# 只有一个数据,最大最小面积一样
if len(vlist) == 1:
df1.loc[i, '最小面积(㎡)'] = vlist[0]
df1.loc[i, '最大面积(㎡)'] = vlist[0]
else:
df1.loc[i, '最小面积(㎡)'] = vlist[0]
df1.loc[i, '最大面积(㎡)'] = vlist[1]
# 重排columns顺序
df4 = df1.loc[:, ['名称', '一居', '二居', '三居', '四居', '五居', '五居以上', '最小面积(㎡)', '最大面积(㎡)', '行政区', '价格(元/㎡)']]
# 清除存在空值的行
df4=df4.dropna()
# 重排索引
df4= df4.reset_index(drop=True)
# # 存入清洗后的数据
df4.to_excel(r'./房天下新房数据(清洗后).xlsx')
# 打开文件
os.startfile(r"房天下新房数据(清洗后).xlsx")