"""Scrape Nanjing second-hand housing listings from Lianjia into a DataFrame.

The script requests the first five result pages of
https://nj.lianjia.com/ershoufang/, pulls four text fields out of every
listing card and gathers them in the module-level data frame ``df``.
Each page's rows are printed as they are parsed.
"""
import pandas as pd  # third-party module used to build and clean the data frame
import requests
from lxml import html

etree = html.etree

# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'
}

# Seconds to wait for the server before giving up on a page; without a
# timeout a stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 10

# Page 1 is the bare listing URL; pages 2-5 use the ".../pg<N>/" form.
urls = []
url = "https://nj.lianjia.com/ershoufang/"
urls.append(url)
url_fore = "https://nj.lianjia.com/ershoufang/pg"
url_last = "/"
urls.extend(f"{url_fore}{page_no}{url_last}" for page_no in range(2, 6))

# Column names of the resulting data frame: community name, total price
# (in units of 10,000), unit price, and the listing's title text.
title = ['小区名', '总价(万)', '单价', '小区介绍']


def _first_text(node, xpath_expr):
    """Return the first result of *xpath_expr* evaluated on *node*.

    Returns ``None`` when the expression matches nothing, so that a listing
    card lacking one field yields a missing value instead of an IndexError
    that would abort the whole run.
    """
    matches = node.xpath(xpath_expr)
    return matches[0] if matches else None


def _parse_listings(page_text):
    """Extract one row per listing card from the HTML text of a result page.

    Args:
        page_text: decoded HTML of one listing page.

    Returns:
        A list of ``[name, total_price, unit_price, listing_title]`` rows in
        the same order as the ``title`` columns. Fields that are absent from
        a card are ``None``.
    """
    tree = etree.HTML(page_text)
    if tree is None:
        # Guard against a parse that produced no root element.
        return []

    rows = []
    for card in tree.xpath('//div[@class="info clear"]'):
        rows.append([
            _first_text(card, './/div[@class="positionInfo"]/a[1]/text()'),
            _first_text(card, './/div[@class="totalPrice totalPrice2"]/span/text()'),
            _first_text(card, './/div[@class="unitPrice"]/span/text()'),
            _first_text(card, './/div[@class="title"]/a/text()'),
        ])
    return rows


# Per-page frames are collected here and concatenated once at the end;
# concatenating inside the loop re-copies all earlier rows on every page.
page_frames = []

# A distinct loop name keeps the module-level base ``url`` intact.
for page_url in urls:
    try:
        response = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Network-level failure (timeout, DNS, connection reset...): report
        # it and move on so the pages already fetched are not lost.
        print(f"请求失败,异常为:{exc},URL为:{page_url}")
        continue

    # Only a 200 response is parsed; anything else is reported and skipped.
    if response.status_code != 200:
        print(f"请求失败,状态码为:{response.status_code},URL为:{page_url}")
        continue

    # Named ``page_text`` rather than ``html`` so the imported lxml module
    # is not shadowed.
    page_text = response.content.decode(encoding='utf-8')
    ershoufang = _parse_listings(page_text)

    df_temp = pd.DataFrame(ershoufang, columns=title)
    print(df_temp)
    if not df_temp.empty:
        # Empty frames are left out of the final concat.
        page_frames.append(df_temp)

# Combined result for all pages. ``ignore_index=True`` gives every row a
# unique label instead of repeating 0..n for each page.
if page_frames:
    df = pd.concat(page_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=title)

# To export the result, uncomment:
# df.to_csv('二手房信息.csv', index=False)