爬取房天下数据

main
pagqhcmx2 1 year ago
parent 360aa790c3
commit 4f8995d336

@ -0,0 +1,84 @@
# _*_ coding:utf-8 _*_
import urllib.request
import random
from tqdm import tqdm
from time import sleep
import urllib
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
'''爬取新房数据存入excel'''
# 数据请求构建
def data_request(url, timeout=30):
    """Fetch *url* with browser-like headers and return the body as text.

    A User-Agent is picked at random from a small fixed pool on every call,
    and a fixed Cookie header is sent along with it.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot hang the crawl indefinitely.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails or times out.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    user_agents = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        'Opera/8.0 (Windows NT 5.1; U; en)',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    )
    headers = {
        # NOTE(review): the apostrophe after "xf_lp%5Elb_pc" looks like a
        # paste artifact; it is kept verbatim here -- confirm before changing.
        'Cookie': "otherid=a17973d936f83e36a1dce147d5401dbc; fang_hao123_layed=1; csrfToken=lcekq8-1Dcmxvkk8JatNlzZr; global_cookie=9t8q5c1pfexde98sri1xy12113pluq89t5t; g_sourcepage=xf_lp%5Elb_pc'; unique_cookie=U_9t8q5c1pfexde98sri1xy12113pluq89t5t*11",
        'User-Agent': random.choice(user_agents),
        'Connection': 'keep-alive',
    }
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the response even if read/decode raises.
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return resp.read().decode('utf-8')
# 数据解析
def data_analysis(content):
    """Parse HTML text into a BeautifulSoup tree.

    Args:
        content: HTML markup to parse.

    Returns:
        A BeautifulSoup object built with the 'html.parser' backend.
    """
    return BeautifulSoup(content, 'html.parser')
# 数据存储
# Characters accepted between the layout part and the area part of the
# '.house_type' text: ASCII hyphen, full-width hyphen, em dash, en dash.
# NOTE(review): the separator literal in the original code was empty, which
# makes str.split raise ValueError; the listing markup appears to use a
# full-width dash here -- confirm against a live page.
_HOUSE_TYPE_SEP = re.compile('[-\uff0d\u2014\u2013]')


def _first_text(node, selector):
    """Return the text of the first match of *selector* under *node*, or ''."""
    matches = node.select(selector)
    return matches[0].text if matches else ''


def _split_house_type(raw_text):
    """Split house-type text into (layout, area); missing parts become ''."""
    compact = ''.join(raw_text.split())  # drop all whitespace, incl. newlines
    # maxsplit=1 guarantees at most two parts, so extra dashes stay in the area.
    parts = _HOUSE_TYPE_SEP.split(compact, maxsplit=1)
    layout = parts[0]
    area = parts[1] if len(parts) > 1 else ''
    return layout, area


def data_storage(soup):
    """Extract one record per listing from a parsed results page.

    Args:
        soup: Parsed page exposing ``select(css_selector)``; listings are
            read from ``#newhouse_loupan_list>ul>li``.

    Returns:
        A list of dicts with the string-valued keys '名称', '户型', '面积',
        '行政区' and '价格(元/㎡)'. A field whose node is absent is stored as
        ''. List items without a name node are skipped, since they cannot be
        identified as a listing.
    """
    one_page_data = []
    for entry in soup.select('#newhouse_loupan_list>ul>li'):
        if not entry.select('.nlcd_name a'):
            continue
        layout, area = _split_house_type(_first_text(entry, '.house_type'))
        district = _first_text(entry, '.address span.sngrey')
        price_text = _first_text(entry, '.nhouse_price')
        one_page_data.append({
            '名称': _first_text(entry, '.nlcd_name a').strip(),
            '户型': layout,
            '面积': area.replace('平米', ''),
            '行政区': district.replace('[', '').replace(']', ''),
            # Keep only the digits of the price text, concatenated in order.
            '价格(元/㎡)': ''.join(re.findall(r'\d', price_text)),
        })
    return one_page_data
# 保存文件
def data_files(all_page_data, save_path):
    """Write the collected records to an Excel file.

    Args:
        all_page_data: Records to tabulate; passed straight to
            ``pandas.DataFrame``.
        save_path: Destination handed to ``DataFrame.to_excel``.
    """
    pd.DataFrame(all_page_data).to_excel(save_path)
# 主函数
if __name__ == '__main__':
    # Crawl result pages 1..page, accumulate every listing record, save them
    # to a workbook, then open the workbook where the platform supports it.
    new_all_data = []
    page = 25
    for i in tqdm(range(1, page + 1)):
        sleep(1)  # pause one second before each request
        url = f'https://cs.newhouse.fang.com/house/s/b9{i}/'
        content = data_request(url)
        soup = data_analysis(content)
        one_page_data = data_storage(soup)
        new_all_data.extend(one_page_data)
    data_files(new_all_data, './房天下新房数据.xlsx')
    # Open the file. os.startfile exists only on Windows, so guard the call
    # instead of ending a finished crawl with AttributeError elsewhere.
    if hasattr(os, 'startfile'):
        os.startfile(r"房天下新房数据.xlsx")
Loading…
Cancel
Save