limi 6 months ago
parent f1784e47c3
commit e4d12c2510

3
.idea/.gitignore vendored

@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

@ -0,0 +1,14 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="JsonStandardCompliance" enabled="false" level="ERROR" enabled_by_default="false" />
<inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredIdentifiers">
<list>
<option value="xw.items" />
<option value="tkinter.messagebox" />
</list>
</option>
</inspection_tool>
</profile>
</component>

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (base)" project-jdk-type="Python SDK" />
</project>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/zz01.iml" filepath="$PROJECT_DIR$/.idea/zz01.iml" />
</modules>
</component>
</project>

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

Binary file not shown.

After

Width:  |  Height:  |  Size: 42 KiB

@ -0,0 +1,84 @@
# _*_ coding:utf-8 _*_
import urllib.request
import random
from tqdm import tqdm
from time import sleep
import urllib
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
'''爬取新房数据存入excel'''
# 数据请求构建
def data_request(url):
    """Fetch ``url`` and return the response body decoded as UTF-8 text.

    Every call sends a fixed Cookie header together with a User-Agent picked
    at random from a small built-in pool.

    Args:
        url: Address to request; anything ``urllib.request.Request`` accepts.

    Returns:
        The complete response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    user_agents = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        'Opera/8.0 (Windows NT 5.1; U; en)',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    )
    headers = {
        # NOTE(review): the apostrophe after "xf_lp%5Elb_pc" looks like a typo
        # in the captured cookie — confirm before changing it.
        'Cookie': "otherid=a17973d936f83e36a1dce147d5401dbc; fang_hao123_layed=1; csrfToken=lcekq8-1Dcmxvkk8JatNlzZr; global_cookie=9t8q5c1pfexde98sri1xy12113pluq89t5t; g_sourcepage=xf_lp%5Elb_pc'; unique_cookie=U_9t8q5c1pfexde98sri1xy12113pluq89t5t*11",
        'User-Agent': random.choice(user_agents),
        'Connection': 'keep-alive',
    }
    req = urllib.request.Request(url, headers=headers)
    # Close the response deterministically instead of leaking the connection.
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode('utf-8')
# 数据解析
def data_analysis(content):
    """Parse HTML markup into a BeautifulSoup tree.

    Args:
        content: HTML markup as text.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.
    """
    return BeautifulSoup(content, 'html.parser')
# 数据存储
def data_storage(soup):
    """Collect one record per listing found in a parsed listing page.

    Each ``li`` under ``#newhouse_loupan_list>ul`` yields a dict with the keys
    ``名称`` (name), ``户型`` (layouts), ``面积`` (area text with the ``平米``
    unit removed), ``行政区`` (district without its square brackets) and
    ``价格(元/㎡)`` (the digits of the price text joined together).

    Args:
        soup: Parsed page exposing ``select``, e.g. the value returned by
            ``data_analysis``.

    Returns:
        A list of dicts in page order.

    Raises:
        IndexError: If a listing lacks the name, layout or price element.
    """
    # NOTE(review): assumes the page puts a dash (full-width, em, en or ASCII)
    # between the layout list and the area, e.g. "三居/四居－89~143平米" —
    # confirm against the live markup. Splitting on an empty separator is
    # not possible: str.split('') raises ValueError.
    separator = re.compile(r'[－—–-]')
    one_page_data = []
    for entry in soup.select('#newhouse_loupan_list>ul>li'):
        item = {}
        item['名称'] = entry.select('.nlcd_name a')[0].text.strip()

        # Remove every whitespace run once, then cut at the first separator
        # only, so a stray extra dash cannot break the two-way unpacking.
        house_type = ''.join(entry.select('.house_type')[0].text.split())
        parts = separator.split(house_type, maxsplit=1)
        item['户型'] = parts[0]
        area = parts[1] if len(parts) == 2 else ''
        item['面积'] = area.replace('平米', '')

        district = entry.select('.address span.sngrey')
        if district:
            item['行政区'] = district[0].text.replace('[', '').replace(']', '')
        else:
            item['行政区'] = ''

        # Keep only the digits of the price text.
        item['价格(元/㎡)'] = ''.join(re.findall(r'\d', entry.select('.nhouse_price')[0].text))
        one_page_data.append(item)
    return one_page_data
# 保存文件
def data_files(all_page_data, save_path):
    """Write the collected records to an Excel workbook.

    Args:
        all_page_data: Records to tabulate; handed directly to
            ``pd.DataFrame``.
        save_path: Destination passed to ``DataFrame.to_excel``.
    """
    pd.DataFrame(all_page_data).to_excel(save_path)
# 主函数
if __name__ == '__main__':
    # Crawl listing pages 1..page, pausing one second before each request,
    # and gather every page's records into a single list.
    new_all_data = []
    page = 25
    for i in tqdm(range(1, page + 1)):
        sleep(1)
        url = f'https://cs.newhouse.fang.com/house/s/b9{i}/'
        content = data_request(url)
        soup = data_analysis(content)
        new_all_data.extend(data_storage(soup))
    data_files(new_all_data, './房天下新房数据.xlsx')
    # Open the workbook for the user. os.startfile exists only on Windows,
    # so skip the convenience step elsewhere instead of crashing after the
    # data has already been saved.
    if hasattr(os, 'startfile'):
        os.startfile(r"房天下新房数据.xlsx")

@ -0,0 +1,43 @@
'''清洗房天下新房数据'''
import os
import pandas as pd
# Load the scraped listings; the first sheet column is the saved index.
df1 = pd.read_excel(r'./房天下新房数据.xlsx', index_col=0)
# Strip tab and newline characters from the district column.
df1['行政区'] = df1['行政区'].str.replace('\t', '').str.replace('\n', '')

# --- Layout field: one indicator column per room type --------------------
# NOTE(review): the marker has to differ from the '' used for "absent",
# otherwise the room columns carry no information. '√' is an assumption —
# confirm it matches what downstream consumers expect.
ROOM_MARK = '√'
ROOM_COLUMNS = ['一居', '二居', '三居', '四居', '五居', '五居以上']
df2 = df1.dropna(subset=['户型'])
# Row index -> list of room types, split on '/'.
hxdict = dict(df2['户型'].str.split('/'))
for i, vlist in hxdict.items():
    for v in vlist:
        df1.loc[i, v] = ROOM_MARK
# A room type that never occurs would otherwise be missing as a column and
# make the final column selection raise KeyError.
for col in ROOM_COLUMNS:
    if col not in df1.columns:
        df1[col] = ''
# Everything after the five scraped columns is a room column; blank out the
# cells that were not marked.
col_list = df1.columns.tolist()[5:]
for col in col_list:
    df1.loc[df1[col].isna(), col] = ''

# --- Area field: split "min~max" into two columns -------------------------
df3 = df1.dropna(subset=['面积'])
# Row index -> list of area bounds, split on '~'.
mjlist = dict(df3['面积'].str.split('~'))
for i, vlist in mjlist.items():
    # A single value means the minimum and maximum area are the same.
    df1.loc[i, '最小面积(㎡)'] = vlist[0]
    df1.loc[i, '最大面积(㎡)'] = vlist[1] if len(vlist) > 1 else vlist[0]

# Select and reorder the output columns.
df4 = df1.loc[:, ['名称', *ROOM_COLUMNS, '最小面积(㎡)', '最大面积(㎡)', '行政区', '价格(元/㎡)']]
# Drop rows that still contain a missing value, then renumber the index.
df4 = df4.dropna().reset_index(drop=True)
# Save the cleaned data.
df4.to_excel(r'./房天下新房数据(清洗后).xlsx')
# Open the result for the user; os.startfile exists only on Windows.
if hasattr(os, 'startfile'):
    os.startfile(r"房天下新房数据(清洗后).xlsx")

@ -0,0 +1,49 @@
#导包
import pandas as pd
import matplotlib.pyplot as plt
# Font settings so Chinese labels and the minus sign render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

df = pd.read_excel('./房天下新房数据(清洗后).xlsx', index_col=0)

# --- Chart 1: mean unit price per district --------------------------------
# The scraper stores the price as text, so it can come back from Excel as
# strings; mean(numeric_only=True) would then drop the column entirely.
priced = df.copy()
priced['价格(元/㎡)'] = pd.to_numeric(priced['价格(元/㎡)'], errors='coerce')
# Actually use the filtered frame (rows without district or price removed).
priced = priced.dropna(subset=['行政区', '价格(元/㎡)'])
df1 = priced.groupby('行政区').mean(numeric_only=True)
plt.title('行政区平均新房单价分析')
df1['价格(元/㎡)'].plot.bar(color='#CCCCFF')
plt.ylabel('价格(元/㎡)')
# Print the rounded mean just above each bar.
for i, v in enumerate(df1['价格(元/㎡)']):
    plt.text(i - 0.35, v + 300, round(v))
plt.xticks(rotation=60)
# Author's recorded observation: Yuelu District had the highest mean unit
# price and Liuyang the lowest.
plt.savefig('./新房数据展示1.jpg')

# --- Chart 2: share of listings offering each room type -------------------
plt.figure(figsize=(10, 10))
plt.title('新房户型占比分析')
room_labels = ['一居', '二居', '三居', '四居', '五居']
# NOTE(review): counts the cells that carry any marker. Blank cells come back
# from Excel as NaN, so looking up '' in value_counts() raises KeyError —
# confirm this matches the marker written by the cleaning step.
room_counts = [
    int((df[label].fillna('').astype(str).str.strip() != '').sum())
    for label in room_labels
]
colors = ['#FFECE5', '#E5CCFF', '#CCCCFF', '#CCFFE5', '#FFE5CC']
plt.pie(room_counts, autopct='%.1f%%', colors=colors, labels=room_labels)
plt.legend()
plt.ylabel('')
plt.savefig('./新房数据展示2.jpg')

# --- Chart 3: share of listings per district ------------------------------
plt.figure(figsize=(10, 10))
df2 = df.dropna(subset=['行政区']).groupby('行政区').count()
plt.title('新房数量占比分析')
colors = ['#FFECE5', '#E5CCFF', '#CCCCFF', '#CCFFE5', '#FFE5CC', '#e6e6fa', '#9cc3eb', '#d0eb9b', "#e6cfe6", "#d1fff8"]
df2['名称'].plot.pie(autopct='%.1f%%', colors=colors)
plt.legend()
plt.ylabel('')
plt.savefig('./新房数据展示3.jpg')

65
ui.py

@ -0,0 +1,65 @@
import tkinter as tk
import os
from tkinter import messagebox
import ttkbootstrap as ttk
#执行new_house.py
def open_new_house():
    """Run the crawler script ``new_house.py`` and wait for it to finish."""
    import subprocess
    import sys

    # Start the script with the interpreter that runs this UI instead of
    # relying on the shell having a file association for ``.py`` files.
    subprocess.run([sys.executable, 'new_house.py'])
# 执行new_house_clean.py
def open_new_house_clean():
    """Run the cleaning script ``new_house_clean.py`` and wait for it to finish."""
    import subprocess
    import sys

    # Start the script with the interpreter that runs this UI instead of
    # relying on the shell having a file association for ``.py`` files.
    subprocess.run([sys.executable, 'new_house_clean.py'])
# 执行new_house_visualization.py
def open_new_house_visualization():
    """Run ``new_house_visualization.py`` and tell the user how it went.

    The success dialog is shown only when the script exits with status 0;
    any other exit status produces an error dialog instead.
    """
    import subprocess
    import sys

    # Start the script with the interpreter that runs this UI instead of
    # relying on the shell having a file association for ``.py`` files.
    result = subprocess.run([sys.executable, 'new_house_visualization.py'])
    if result.returncode == 0:
        messagebox.showinfo('提醒', '可视化完成!可以点击主页面下面按钮查看图像')
    else:
        # Do not claim success when the charts were not produced.
        messagebox.showerror('提醒', '可视化失败,请先完成爬虫和数据清洗后重试')
def open_analyse_1():
    """Open the first saved chart image with its associated application."""
    chart_path = "新房数据展示1.jpg"
    os.startfile(chart_path)
def open_analyse_2():
    """Open the second saved chart image with its associated application."""
    chart_path = "新房数据展示2.jpg"
    os.startfile(chart_path)
def open_analyse_3():
    """Open the third saved chart image with its associated application."""
    chart_path = "新房数据展示3.jpg"
    os.startfile(chart_path)
# Main window, styled with the "minty" theme.
window = ttk.Window()
style = ttk.Style("minty")
window.geometry("500x350+400+250")
window.title("房天下长沙新房爬取")

# Banner image at the top of the window. image_file stays bound at module
# level so the picture is not garbage-collected while the window is open.
canvas = tk.Canvas(window, height=150, width=500)
image_file = tk.PhotoImage(file=r"./IMG_0818.PNG")
image = canvas.create_image(50, 20, anchor='nw', image=image_file)
canvas.pack(side='top')

# Row of pipeline buttons: crawl, clean, visualize.
b1 = tk.Button(window, text="1.开始爬虫", command=open_new_house)
b1.place(x=110, y=150)

b2 = tk.Button(window, text="2.清洗数据", command=open_new_house_clean)
b2.place(x=210, y=150)

b3 = tk.Button(window, text="3.开始可视化", command=open_new_house_visualization)
b3.place(x=310, y=150)

# Column of buttons that open the three saved chart images.
b4 = tk.Button(window, text="3.1查看行政区平均新房单价分析图像", command=open_analyse_1)
b4.place(x=130, y=190)

b5 = tk.Button(window, text="3.2查看新房户型占比分析图像", command=open_analyse_2)
b5.place(x=130, y=230)

b6 = tk.Button(window, text="3.3查看新房数量占比分析图像", command=open_analyse_3)
b6.place(x=130, y=270)

# Enter the Tk event loop.
window.mainloop()

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

Loading…
Cancel
Save