import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin  # build absolute article URLs from the hrefs on the list page

# Send the HTTP request and fetch the page content
url = 'https://www.chinanews.com/world.shtml'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'
}
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
html = response.text
# print(html)

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, 'lxml')

# Locate each item's category, title, link, and publish time
news_list = soup.find_all('div', class_='content_list')
# print(news_list[0])
# print("======================================")
soup1 = BeautifulSoup(str(news_list), 'lxml')  # re-parse just the headline list markup
did = soup1.find_all('div', 'dd_lm')     # category label of each item
title = soup1.find_all('div', 'dd_bt')   # headline cell; its <a> carries both the title text and the link (class name assumed from the same list markup as dd_lm/dd_time)
bid = soup1.find_all('div', 'dd_time')   # publish time

for d, t, b in zip(did, title, bid):
    a = t.find('a')
    print(f'Category: {d.text}\nTitle: {a.text}\nLink: {urljoin(url, a["href"])}\nTime: {b.text}')
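
# Optional follow-up: persist the scraped items instead of only printing them.
# A minimal sketch reusing the did/title/bid lists from above; the output
# filename 'world_news.csv' and the utf-8-sig encoding (chosen so spreadsheet
# apps display the Chinese text correctly) are arbitrary assumptions.
import csv

rows = []
for d, t, b in zip(did, title, bid):
    a = t.find('a')
    rows.append({
        'category': d.text.strip(),
        'title': a.text.strip(),
        'link': urljoin(url, a['href']),
        'time': b.text.strip(),
    })

with open('world_news.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.DictWriter(f, fieldnames=['category', 'title', 'link', 'time'])
    writer.writeheader()
    writer.writerows(rows)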