|
|
|
@ -0,0 +1,43 @@
|
|
|
|
|
'''清洗房天下新房数据'''
|
|
|
|
|
import os
|
|
|
|
|
import pandas as pd
|
|
|
|
|
# 读取数据
|
|
|
|
|
df1 = pd.read_excel(r'./房天下新房数据.xlsx', index_col=0)
|
|
|
|
|
# 清洗行政区\t\n
|
|
|
|
|
df1['行政区'] = df1['行政区'].str.replace('\t', '').str.replace('\n', '')
|
|
|
|
|
|
|
|
|
|
# 处理户型字段
|
|
|
|
|
df2 = df1.dropna(subset=['户型'])
|
|
|
|
|
# 索引:户型list
|
|
|
|
|
hxdict = dict(df2['户型'].str.split('/'))
|
|
|
|
|
for i, vlist in hxdict.items():
|
|
|
|
|
for v in vlist:
|
|
|
|
|
df1.loc[i, v] = '有'
|
|
|
|
|
col_list = df1.columns.tolist()[5:]
|
|
|
|
|
for col in col_list:
|
|
|
|
|
df1.loc[df1[col].isna(), col] = '无'
|
|
|
|
|
# 处理面积字段
|
|
|
|
|
df3 = df1.dropna(subset=['面积'])
|
|
|
|
|
# 索引:面积list
|
|
|
|
|
mjlist = dict(df3['面积'].str.split('~'))
|
|
|
|
|
for i, vlist in mjlist.items():
|
|
|
|
|
# 只有一个数据,最大最小面积一样
|
|
|
|
|
if len(vlist) == 1:
|
|
|
|
|
df1.loc[i, '最小面积(㎡)'] = vlist[0]
|
|
|
|
|
df1.loc[i, '最大面积(㎡)'] = vlist[0]
|
|
|
|
|
else:
|
|
|
|
|
df1.loc[i, '最小面积(㎡)'] = vlist[0]
|
|
|
|
|
df1.loc[i, '最大面积(㎡)'] = vlist[1]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 重排columns顺序
|
|
|
|
|
df4 = df1.loc[:, ['名称', '一居', '二居', '三居', '四居', '五居', '五居以上', '最小面积(㎡)', '最大面积(㎡)', '行政区', '价格(元/㎡)']]
|
|
|
|
|
# 清除存在空值的行
|
|
|
|
|
df4=df4.dropna()
|
|
|
|
|
# 重排索引
|
|
|
|
|
df4= df4.reset_index(drop=True)
|
|
|
|
|
|
|
|
|
|
# # 存入清洗后的数据
|
|
|
|
|
df4.to_excel(r'./房天下新房数据(清洗后).xlsx')
|
|
|
|
|
# 打开文件
|
|
|
|
|
os.startfile(r"房天下新房数据(清洗后).xlsx")
|