|
|
import requests
|
|
|
from lxml import html
|
|
|
etree = html.etree
|
|
|
import pandas as pd #用于数据处理和生成数据框 清洗的第三方模块
|
|
|
|
|
|
# HTTP headers sent with every request; the User-Agent is a desktop
# Chrome/Edge browser string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0',
}
|
|
|
|
|
|
# Pages to scrape: the landing page first, then pages 2 through 5, whose
# addresses are built as url_fore + <page number> + url_last.
url = "https://nj.lianjia.com/ershoufang/"
url_fore = "https://nj.lianjia.com/ershoufang/pg"
url_last = "/"

urls = [url]
# range(2, 6) stops before 6, so the last generated page is pg5.
urls.extend(url_fore + str(page) + url_last for page in range(2, 6))
|
|
|
|
|
|
# Column labels of the result table. The strings are runtime data and are
# kept verbatim; they read: community name, total price (in units of ten
# thousand), unit price, community introduction.
title = ['小区名', '总价(万)', '单价', '小区介绍']

# Empty DataFrame whose columns are the labels in `title`; rows scraped
# from each page are concatenated onto it later in the script.
df = pd.DataFrame(columns=title)
|
|
|
|
|
|
def _first_text(node, path):
    """Return the first result of XPath *path* under *node*, or None if empty."""
    matches = node.xpath(path)
    return matches[0] if matches else None


# Download every page in `urls`, extract one row per listing block and
# accumulate the rows in `df`.
for url in urls:
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # Network-level failure (DNS, refused connection, timeout, ...):
        # report it and move on to the next page instead of crashing.
        print(f"请求失败,错误为:{exc},URL为:{url}")
        continue

    # Any status other than 200 is treated as a failed page and skipped.
    if response.status_code != 200:
        print(f"请求失败,状态码为:{response.status_code},URL为:{url}")
        continue

    # Named `page_source` rather than `html` so the lxml `html` module
    # imported at the top of the file is not overwritten.
    page_source = response.content.decode(encoding='utf-8')
    # print(page_source)

    tree = etree.HTML(page_source)

    # Each <div class="info clear"> is processed as one listing.
    ershoufang = []
    for div in tree.xpath('//div[@class="info clear"]'):
        # A field whose node is absent becomes None instead of raising
        # IndexError and aborting the whole scrape.
        house_name = _first_text(div, './/div[@class="positionInfo"]/a[1]/text()')
        priceinfo = _first_text(div, './/div[@class="totalPrice totalPrice2"]/span/text()')
        price = _first_text(div, './/div[@class="unitPrice"]/span/text()')
        house_title = _first_text(div, './/div[@class="title"]/a/text()')

        ershoufang.append([house_name, priceinfo, price, house_title])
    # print(ershoufang)

    df_temp = pd.DataFrame(ershoufang, columns=title)
    # ignore_index renumbers the rows; without it every page would
    # contribute labels starting again at 0, leaving duplicates in df.
    df = pd.concat([df, df_temp], ignore_index=True)
    print(df_temp)
|
|
|
|
|
|
# Optional export: uncomment the next line to write the combined table to a CSV file.
# df.to_csv('二手房信息.csv', index=False)
|
|
|
|