parent
360aa790c3
commit
4f8995d336
@ -0,0 +1,84 @@
|
|||||||
|
# _*_ coding:utf-8 _*_
|
||||||
|
import urllib.request
|
||||||
|
import random
|
||||||
|
from tqdm import tqdm
|
||||||
|
from time import sleep
|
||||||
|
import urllib
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import re
|
||||||
|
import pandas as pd
|
||||||
|
import os
|
||||||
|
'''爬取新房数据存入excel'''
|
||||||
|
# 数据请求构建
|
||||||
|
def data_request(url):
    """Download ``url`` with browser-like headers and return the body as text.

    Each call picks a User-Agent at random from a small fixed pool and sends
    it together with a fixed cookie string and ``Connection: keep-alive``.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # A tuple is enough here: the pool is fixed and only ever sampled from.
    user_agents = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        'Opera/8.0 (Windows NT 5.1; U; en)',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    )

    headers = {
        'Cookie': "otherid=a17973d936f83e36a1dce147d5401dbc; fang_hao123_layed=1; csrfToken=lcekq8-1Dcmxvkk8JatNlzZr; global_cookie=9t8q5c1pfexde98sri1xy12113pluq89t5t; g_sourcepage=xf_lp%5Elb_pc'; unique_cookie=U_9t8q5c1pfexde98sri1xy12113pluq89t5t*11",
        'User-Agent': random.choice(user_agents),
        'Connection': 'keep-alive',
    }
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the response even when read()/decode() raise,
    # so repeated calls do not accumulate open connections.
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode('utf-8')
|
||||||
|
# 数据解析
|
||||||
|
def data_analysis(content):
    """Parse an HTML document into a BeautifulSoup tree.

    Args:
        content: HTML markup to parse.

    Returns:
        The ``BeautifulSoup`` object built from ``content`` with the
        ``html.parser`` backend.
    """
    return BeautifulSoup(content, 'html.parser')
|
||||||
|
|
||||||
|
|
||||||
|
# 数据存储
|
||||||
|
def _first_text(node, selector):
    """Return the text of the first ``selector`` match under ``node``, or ''."""
    matches = node.select(selector)
    return matches[0].text if matches else ''


def data_storage(soup):
    """Extract one record per listing from a parsed result page.

    Listings are the elements matched by ``#newhouse_loupan_list>ul>li``.
    Any field whose element is absent from a listing is stored as an empty
    string instead of raising.

    Args:
        soup: Parsed page; only its ``select`` method is used, and each
            selected element must offer ``select`` and ``text`` as well.

    Returns:
        A list of dicts with the keys '名称' (name), '户型' (layout),
        '面积' (area), '行政区' (district) and '价格(元/㎡)' (price), in that
        order. All values are strings.
    """
    one_page_data = []
    for entry in soup.select('#newhouse_loupan_list>ul>li'):
        # Strip every whitespace character, then cut at the FIRST dash only.
        # partition() never raises: with no dash the area is '', and with
        # several dashes the remainder stays in the area field rather than
        # breaking a two-target unpack.
        house_type = ''.join(_first_text(entry, '.house_type').split())
        layout, _, area = house_type.partition('—')

        district = _first_text(entry, '.address span.sngrey')
        price_text = _first_text(entry, '.nhouse_price')

        one_page_data.append({
            '名称': _first_text(entry, '.nlcd_name a').strip(),
            '户型': layout,
            '面积': area.replace('平米', ''),
            '行政区': district.replace('[', '').replace(']', ''),
            # Keep the digits only; text without digits yields ''.
            '价格(元/㎡)': ''.join(re.findall(r'\d', price_text)),
        })
    return one_page_data
|
||||||
|
|
||||||
|
|
||||||
|
# 保存文件
|
||||||
|
def data_files(all_page_data, save_path):
    """Write the collected records to an Excel workbook.

    Args:
        all_page_data: Records to tabulate; handed unchanged to
            ``pandas.DataFrame``.
        save_path: Destination handed to ``DataFrame.to_excel``.
    """
    pd.DataFrame(all_page_data).to_excel(save_path)
|
||||||
|
|
||||||
|
# 主函数
|
||||||
|
if __name__ == '__main__':
    # One definition of the output path, used for both saving and opening.
    save_path = './房天下新房数据.xlsx'
    new_all_data = []
    page = 25
    for i in tqdm(range(1, page + 1)):
        # Pause one second before every request.
        sleep(1)
        url = f'https://cs.newhouse.fang.com/house/s/b9{i}/'
        content = data_request(url)
        soup = data_analysis(content)
        one_page_data = data_storage(soup)
        new_all_data.extend(one_page_data)
    data_files(new_all_data, save_path)
    # Open the saved workbook. os.startfile exists only on Windows, so the
    # call is skipped elsewhere instead of ending the run with AttributeError.
    if hasattr(os, 'startfile'):
        os.startfile(os.path.abspath(save_path))
|
||||||
|
|
Loading…
Reference in new issue