parent
f1784e47c3
commit
e4d12c2510
@ -0,0 +1,3 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
@ -0,0 +1,14 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="JsonStandardCompliance" enabled="false" level="ERROR" enabled_by_default="false" />
|
||||
<inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
||||
<option name="ignoredIdentifiers">
|
||||
<list>
|
||||
<option value="xw.items" />
|
||||
<option value="tkinter.messagebox" />
|
||||
</list>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
</profile>
|
||||
</component>
|
@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
@ -0,0 +1,4 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (base)" project-jdk-type="Python SDK" />
|
||||
</project>
|
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/zz01.iml" filepath="$PROJECT_DIR$/.idea/zz01.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
After Width: | Height: | Size: 42 KiB |
@ -0,0 +1,84 @@
|
||||
# _*_ coding:utf-8 _*_
|
||||
import urllib.request
|
||||
import random
|
||||
from tqdm import tqdm
|
||||
from time import sleep
|
||||
import urllib
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
import pandas as pd
|
||||
import os
|
||||
'''爬取新房数据存入excel'''
|
||||
# 数据请求构建
|
||||
def data_request(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Each call sends a fixed Cookie header together with a User-Agent picked
    at random from a small built-in pool.

    Args:
        url: Address to request.

    Returns:
        The full response body as a ``str``.

    Errors raised by ``urlopen`` or by the UTF-8 decode are not caught here
    and propagate to the caller.
    """
    # A tuple gives random.choice a stable, indexable pool directly.
    user_agents = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        'Opera/8.0 (Windows NT 5.1; U; en)',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    )
    headers = {
        'Cookie': "otherid=a17973d936f83e36a1dce147d5401dbc; fang_hao123_layed=1; csrfToken=lcekq8-1Dcmxvkk8JatNlzZr; global_cookie=9t8q5c1pfexde98sri1xy12113pluq89t5t; g_sourcepage=xf_lp%5Elb_pc'; unique_cookie=U_9t8q5c1pfexde98sri1xy12113pluq89t5t*11",
        'User-Agent': random.choice(user_agents),
        'Connection': 'keep-alive',
    }
    req = urllib.request.Request(url, headers=headers)
    # Close the response (and its underlying connection) once the body is read.
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode('utf-8')
|
||||
# 数据解析
|
||||
def data_analysis(content):
    """Parse HTML text into a BeautifulSoup tree.

    Args:
        content: HTML markup to parse.

    Returns:
        A ``BeautifulSoup`` object built with the ``'html.parser'`` backend.
    """
    return BeautifulSoup(content, 'html.parser')
|
||||
|
||||
|
||||
# 数据存储
|
||||
def _first_text(node, selector):
    """Return the text of the first match for ``selector`` under ``node``, or '' if none."""
    matches = node.select(selector)
    return matches[0].text if matches else ''


def data_storage(soup):
    """Extract one record per listing from a parsed result page.

    Args:
        soup: Parsed page exposing ``select``; listings are the elements
            matched by ``'#newhouse_loupan_list>ul>li'``.

    Returns:
        A list of dicts, one per listing, with the keys '名称' (name),
        '户型' (layout), '面积' (area, with '平米' removed), '行政区'
        (district, with square brackets removed) and '价格(元/㎡)' (all
        digits found in the price text, concatenated). A field whose
        element is missing from the listing is stored as ''.
    """
    one_page_data = []
    for entry in soup.select('#newhouse_loupan_list>ul>li'):
        # Drop all whitespace, then split "layout—area" on the FIRST dash only,
        # so an area that itself contains a dash no longer breaks unpacking.
        layout_text = ''.join(_first_text(entry, '.house_type').split())
        layout, _, area = layout_text.partition('—')
        district = _first_text(entry, '.address span.sngrey')
        price_text = _first_text(entry, '.nhouse_price')
        one_page_data.append({
            '名称': _first_text(entry, '.nlcd_name a').strip(),
            '户型': layout,
            '面积': area.replace('平米', ''),
            '行政区': district.replace('[', '').replace(']', ''),
            '价格(元/㎡)': ''.join(re.findall(r'\d', price_text)),
        })
    return one_page_data
|
||||
|
||||
|
||||
# 保存文件
|
||||
def data_files(all_page_data, save_path):
    """Write the collected records to an Excel file.

    Args:
        all_page_data: Records to tabulate; passed straight to ``pd.DataFrame``.
        save_path: Destination path handed to ``DataFrame.to_excel``.
    """
    pd.DataFrame(all_page_data).to_excel(save_path)
|
||||
|
||||
# 主函数
|
||||
if __name__ == '__main__':
    # Walk result pages 1..25, pausing one second before each request,
    # and gather every listing into a single list.
    total_pages = 25
    collected = []
    for page_no in tqdm(range(1, total_pages + 1)):
        sleep(1)
        page_url = f'https://cs.newhouse.fang.com/house/s/b9{page_no}/'
        page_soup = data_analysis(data_request(page_url))
        collected.extend(data_storage(page_soup))

    data_files(collected, './房天下新房数据.xlsx')
    # Open the saved workbook with its associated application.
    os.startfile(r"房天下新房数据.xlsx")
|
||||
|
@ -0,0 +1,65 @@
|
||||
import tkinter as tk
|
||||
import os
|
||||
from tkinter import messagebox
|
||||
import ttkbootstrap as ttk
|
||||
|
||||
#执行new_house.py
|
||||
def open_new_house():
    """Run new_house.py through the system shell."""
    script = 'new_house.py'
    os.system(script)
|
||||
# 执行new_house_clean.py
|
||||
def open_new_house_clean():
    """Run new_house_clean.py through the system shell."""
    script = 'new_house_clean.py'
    os.system(script)
|
||||
# 执行new_house_visualization.py
|
||||
def open_new_house_visualization():
    """Run new_house_visualization.py and report the outcome in a dialog.

    The script is launched through the system shell. A non-zero exit status
    produces an error dialog instead of the completion notice, so the user
    is not told the charts are ready when the script did not succeed.
    """
    exit_status = os.system('new_house_visualization.py')
    if exit_status != 0:
        # Do not claim success when the script failed or could not be started.
        messagebox.showerror(
            '提醒',
            f'可视化失败(退出码 {exit_status}),请检查 new_house_visualization.py',
        )
        return
    messagebox.showinfo('提醒', '可视化完成!可以点击主页面下面按钮查看图像')
|
||||
def open_analyse_1():
    """Open the first chart image with its associated application."""
    chart_path = "新房数据展示1.jpg"
    os.startfile(chart_path)
|
||||
def open_analyse_2():
    """Open the second chart image with its associated application."""
    chart_path = "新房数据展示2.jpg"
    os.startfile(chart_path)
|
||||
def open_analyse_3():
    """Open the third chart image with its associated application."""
    chart_path = "新房数据展示3.jpg"
    os.startfile(chart_path)
|
||||
|
||||
# 窗口对象+美化
|
||||
# Main window, themed with ttkbootstrap's "minty" style
window = ttk.Window()
style = ttk.Style("minty")
window.title("房天下长沙新房爬取")
window.geometry("500x350+400+250")

# Banner image drawn on a canvas at the top of the window
canvas = tk.Canvas(window, width=500, height=150)
image_file = tk.PhotoImage(file=r"./IMG_0818.PNG")
image = canvas.create_image(50, 20, anchor='nw', image=image_file)
canvas.pack(side='top')

# Step 1: run the scraper
b1 = tk.Button(window, text="1.开始爬虫", command=open_new_house)
b1.place(x=110, y=150)

# Step 2: clean the scraped data
b2 = tk.Button(window, text="2.清洗数据", command=open_new_house_clean)
b2.place(x=210, y=150)

# Step 3: generate the charts
b3 = tk.Button(window, text="3.开始可视化", command=open_new_house_visualization)
b3.place(x=310, y=150)

# Steps 3.1-3.3: open each generated chart image
b4 = tk.Button(window, text="3.1查看行政区平均新房单价分析图像", command=open_analyse_1)
b4.place(x=130, y=190)

b5 = tk.Button(window, text="3.2查看新房户型占比分析图像", command=open_analyse_2)
b5.place(x=130, y=230)

b6 = tk.Button(window, text="3.3查看新房数量占比分析图像", command=open_analyse_3)
b6.place(x=130, y=270)

# Enter the Tk event loop
window.mainloop()
|
Binary file not shown.
Binary file not shown.
After Width: | Height: | Size: 26 KiB |
After Width: | Height: | Size: 27 KiB |
After Width: | Height: | Size: 33 KiB |
Loading…
Reference in new issue