diff --git a/IMG_0818.PNG b/IMG_0818.PNG deleted file mode 100644 index aeb0c3d..0000000 Binary files a/IMG_0818.PNG and /dev/null differ diff --git a/README.md b/README.md deleted file mode 100644 index 35ab420..0000000 --- a/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# zz01 - diff --git a/new_house.py b/new_house.py deleted file mode 100644 index fce6692..0000000 --- a/new_house.py +++ /dev/null @@ -1,84 +0,0 @@ -# _*_ coding:utf-8 _*_ -import urllib.request -import random -from tqdm import tqdm -from time import sleep -import urllib -from bs4 import BeautifulSoup -import re -import pandas as pd -import os -'''爬取新房数据存入excel''' -# 数据请求构建 -def data_request(url): - user_agents = list({ - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60', - 'Opera/8.0 (Windows NT 5.1; U; en)', - 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50', - 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50', - 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0'}) - - headers = { - 'Cookie': "otherid=a17973d936f83e36a1dce147d5401dbc; fang_hao123_layed=1; csrfToken=lcekq8-1Dcmxvkk8JatNlzZr; global_cookie=9t8q5c1pfexde98sri1xy12113pluq89t5t; g_sourcepage=xf_lp%5Elb_pc'; unique_cookie=U_9t8q5c1pfexde98sri1xy12113pluq89t5t*11", - 'User-Agent': str(random.choice(user_agents)), - 'Connection': 'keep-alive', - - } - req = urllib.request.Request(url, headers=headers) - resp = urllib.request.urlopen(req) - content = resp.read().decode('utf-8') - return content -# 数据解析 -def data_analysis(content): - html = content - soup = BeautifulSoup(content, 'html.parser') - return soup - - -# 数据存储 -def data_storage(soup): - one_page_data = [] - item_list = soup.select('#newhouse_loupan_list>ul>li') - for i in item_list: - # {'名称': None, '户型': None, '面积': None, '地址': None, '价格'} - item = {} - item['名称'] = i.select('.nlcd_name a')[0].text.strip() - n = len(''.join(i.select('.house_type')[0].text.split()).split('—')) - if n == 0: - item['户型'], item['面积'] = '', '' - elif n == 1: - item['户型'] = ''.join(i.select('.house_type')[0].text.split()).split('—')[0] - item['面积'] = '' - else: - item['户型'], item['面积'] = ''.join(i.select('.house_type')[0].text.split()).split('—') - item['面积']=item['面积'].replace('平米','') - item['行政区'] = i.select('.address span.sngrey') - if len(item['行政区'])==0: - item['行政区'] ='' - else: - item['行政区'] =item['行政区'][0].text.replace('[','').replace(']','') - item['价格(元/㎡)'] = ''.join(re.findall(r'\d', i.select('.nhouse_price')[0].text)) - one_page_data.append(item) - return one_page_data - - -# 保存文件 -def data_files(all_page_data,save_path): - df = pd.DataFrame(all_page_data) - df.to_excel(save_path) - -# 主函数 -if __name__ == '__main__': - new_all_data=[] - page=25 - for i in tqdm(range(1,page+1)): - sleep(1) - url=f'https://cs.newhouse.fang.com/house/s/b9{i}/' - content=data_request(url) - soup = data_analysis(content) - one_page_data = data_storage(soup) - new_all_data.extend(one_page_data) - data_files(new_all_data, './房天下新房数据.xlsx') - # 打开文件 - os.startfile(r"房天下新房数据.xlsx") - diff --git a/new_house_clean.py b/new_house_clean.py deleted file mode 100644 index dfb46d0..0000000 --- a/new_house_clean.py +++ /dev/null @@ -1,43 +0,0 @@ -'''清洗房天下新房数据''' -import os -import pandas as pd -# 读取数据 -df1 = pd.read_excel(r'./房天下新房数据.xlsx', index_col=0) -# 清洗行政区\t\n -df1['行政区'] = df1['行政区'].str.replace('\t', '').str.replace('\n', '') - -# 处理户型字段 -df2 = df1.dropna(subset=['户型']) -# 索引:户型list -hxdict = dict(df2['户型'].str.split('/')) -for i, vlist in hxdict.items(): - for v in vlist: - df1.loc[i, v] = '有' -col_list = df1.columns.tolist()[5:] -for col in col_list: - df1.loc[df1[col].isna(), col] = '无' -# 处理面积字段 -df3 = df1.dropna(subset=['面积']) -# 索引:面积list -mjlist = dict(df3['面积'].str.split('~')) -for i, vlist in mjlist.items(): - # 只有一个数据,最大最小面积一样 - if len(vlist) == 1: - df1.loc[i, '最小面积(㎡)'] = vlist[0] - df1.loc[i, '最大面积(㎡)'] = vlist[0] - else: - df1.loc[i, '最小面积(㎡)'] = vlist[0] - df1.loc[i, '最大面积(㎡)'] = vlist[1] - - -# 重排columns顺序 -df4 = df1.loc[:, ['名称', '一居', '二居', '三居', '四居', '五居', '五居以上', '最小面积(㎡)', '最大面积(㎡)', '行政区', '价格(元/㎡)']] -# 清除存在空值的行 -df4=df4.dropna() -# 重排索引 -df4= df4.reset_index(drop=True) - -# # 存入清洗后的数据 -df4.to_excel(r'./房天下新房数据(清洗后).xlsx') -# 打开文件 -os.startfile(r"房天下新房数据(清洗后).xlsx") \ No newline at end of file diff --git a/new_house_visualization.py b/new_house_visualization.py deleted file mode 100644 index aeac561..0000000 --- a/new_house_visualization.py +++ /dev/null @@ -1,49 +0,0 @@ -#导包 -import pandas as pd -import matplotlib.pyplot as plt -# 正确显示 -plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 -plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 -df=pd.read_excel('./房天下新房数据(清洗后).xlsx',index_col=0) -df1=df.dropna(subset=['行政区','价格(元/㎡)']) -# 根据行政区分组取平均 -df1=df.groupby('行政区').mean(numeric_only=True) - -# 行政区平均新房单价分析 -plt.title('行政区平均新房单价分析') -df1['价格(元/㎡)'].plot.bar(color='#CCCCFF') -plt.ylabel('价格(元/㎡)') -for i,v in enumerate(df1['价格(元/㎡)']): - plt.text(i-0.35,v+300,round(v)) -plt.xticks(rotation=60) -''' -岳麓区的平均新房单价是最高的,浏阳的平均新房单价是最低的 -''' -plt.savefig('./新房数据展示1.jpg') - -# 新房户型占比分析 -plt.figure(figsize=(10,10)) -plt.title('新房户型占比分析') -df2=df -df2[df2['一居']=='有'].count() -a1=df2.一居.value_counts()['有'] -a2=df2.二居.value_counts()['有'] -a3=df2.三居.value_counts()['有'] -a4=df2.四居.value_counts()['有'] -a5=df2.五居.value_counts()['有'] -colors=['#FFECE5','#E5CCFF','#CCCCFF','#CCFFE5','#FFE5CC'] -plt.pie([a1,a2,a3,a4,a5],autopct='%.1f%%',colors=colors,labels=['一居','二居','三居','四居','五居']) -plt.legend() -plt.ylabel('') -plt.savefig('./新房数据展示2.jpg') - -# 新房数量占比分析 -plt.figure(figsize=(10,10)) -df2=df.dropna(subset=['行政区']) -df2=df.groupby('行政区').count() -plt.title('新房数量占比分析') -colors=['#FFECE5','#E5CCFF','#CCCCFF','#CCFFE5','#FFE5CC','#e6e6fa','#9cc3eb','#d0eb9b',"#e6cfe6","#d1fff8"] -df2['名称'].plot.pie(autopct='%.1f%%',colors=colors) -plt.legend() -plt.ylabel('') -plt.savefig('./新房数据展示3.jpg') diff --git a/ui.py b/ui.py deleted file mode 100644 index f1b2492..0000000 --- a/ui.py +++ /dev/null @@ -1,65 +0,0 @@ -import tkinter as tk -import os -from tkinter import messagebox -import ttkbootstrap as ttk - -#执行new_house.py -def open_new_house(): - os.system('new_house.py') -# 执行new_house_clean.py -def open_new_house_clean(): - os.system('new_house_clean.py') -# 执行new_house_visualization.py -def open_new_house_visualization(): - os.system('new_house_visualization.py') - messagebox.showinfo('提醒', '可视化完成!可以点击主页面下面按钮查看图像') -def open_analyse_1(): - os.startfile(f"新房数据展示1.jpg") -def open_analyse_2(): - os.startfile(f"新房数据展示2.jpg") -def open_analyse_3(): - os.startfile(f"新房数据展示3.jpg") - -# 窗口对象+美化 -window = ttk.Window() -style = ttk.Style("minty") -window.geometry("500x350+400+250") -window.title("房天下长沙新房爬取") -# 图片 -canvas = tk.Canvas(window, height=150, width=500) -image_file = tk.PhotoImage(file=r"./IMG_0818.PNG") -image = canvas.create_image(50,20, anchor='nw', image=image_file) -canvas.pack(side='top') - -# 开始爬虫 -b1=tk.Button(window) -b1['text']="1.开始爬虫" -b1['command']=open_new_house -b1.place(x=110, y=150) -# 清洗数据 -b2=tk.Button(window) -b2['text']="2.清洗数据" -b2['command']=open_new_house_clean -b2.place(x=210, y=150) -# 可视化 -b3=tk.Button(window) -b3['text']="3.开始可视化" -b3['command']=open_new_house_visualization -b3.place(x=310, y=150) - -b4=tk.Button(window) -b4['text']="3.1查看行政区平均新房单价分析图像" -b4['command']=open_analyse_1 -b4.place(x=130, y=190) - -b5=tk.Button(window) -b5['text']="3.2查看新房户型占比分析图像" -b5['command']=open_analyse_2 -b5.place(x=130, y=230) - -b6=tk.Button(window) -b6['text']="3.3查看新房数量占比分析图像" -b6['command']=open_analyse_3 -b6.place(x=130, y=270) -# 进入消息循环 -window.mainloop() \ No newline at end of file