You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.
import requests # 网页请求的库
from bs4 import BeautifulSoup # 网页标签解析的库
import pandas as pd # 用于数据分析, 此处用于excel的持久化
# Scrape a China Daily channel page and save each entry's headline and
# publication time to an Excel workbook in the current directory.
url = 'https://china.chinadaily.com.cn/5bd5639ca3101a87ca8ff636'  # page to crawl

# A timeout keeps the script from hanging forever on a stalled connection.
rep = requests.get(url, timeout=10)
# Fail loudly on an HTTP error instead of silently parsing an error page.
rep.raise_for_status()
rep.encoding = 'utf-8'  # decode the response body as UTF-8
html = rep.text  # page source as text

soup = BeautifulSoup(html, 'lxml')  # parse with the lxml backend
# Each matched <div class="busBox3"> is treated as one article entry.
divs = soup.find_all('div', class_='busBox3')
# print(divs)  # uncomment to inspect the matched elements

data = []  # one dict per article: {'标题': title, '发布时间': publication time}
for div in divs:
    h3 = div.find('h3')  # headline element
    if h3 is None:
        # find() returns None when the tag is absent; an entry without a
        # headline carries nothing worth recording, so skip it.
        continue
    b = div.find('b')  # publication-time element
    data.append({
        '标题': h3.text,
        # Keep the article even when the timestamp is missing.
        '发布时间': b.text if b is not None else '',
    })

# Passing the columns explicitly keeps the header row even when no entry matched.
df = pd.DataFrame(data, columns=['标题', '发布时间'])
df.to_excel('中国日报.xlsx', index=False)  # write without the index column
print('中国日报.xlsx 保存成功!')