You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
77 lines
2.9 KiB
77 lines
2.9 KiB
3 months ago
|
import openpyxl
|
||
|
import requests
|
||
|
from lxml import etree
|
||
|
from tqdm import tqdm
|
||
|
|
||
|
# Request headers passed to every requests.get() call in this script;
# the user-agent string presents the scraper as desktop Chrome 109.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/109.0.0.0 Safari/537.36'
    ),
}
|
||
|
|
||
|
|
||
|
# Build the per-date scroll-news index links
|
||
|
def cnew_url(path=r'D:/工作文件/cnew_url.txt', year=2023, month=2, days=range(1, 6)):
    """Write one chinanews scroll-news index URL per day to a text file.

    Each output line has the form
    ``https://www.chinanews.com.cn/scroll-news/<year>/<MMDD>/news.shtml``.
    Called with no arguments, the function reproduces the original
    hard-coded output: 1-5 February 2023 written to
    ``D:/工作文件/cnew_url.txt``.

    Args:
        path: Destination file. It is opened in ``'w'`` mode, so any
            previous content is replaced.
        year: Year component of the URL.
        month: Month number; zero-padded to two digits in the URL.
        days: Iterable of day-of-month numbers; each is zero-padded to
            two digits in the URL.
    """
    template = 'https://www.chinanews.com.cn/scroll-news/{}/{:02d}{:02d}/news.shtml'
    # The context manager closes the file even if a write fails.
    with open(path, 'w', encoding='utf8') as url_file:
        for day in days:
            url_file.write(template.format(year, month, day) + '\n')
|
||
|
|
||
|
|
||
|
def _cnew_article_links(hrefs):
    """Turn the two href shapes found on an index page into absolute URLs."""
    return [
        'https:' + href if href[:5] == '//www'
        else 'https://www.chinanews.com.cn/' + href
        for href in hrefs
    ]


def _cnew_article_row(category, link):
    """Fetch one article and return its row as four lists of strings.

    The lists are, in column order: category, title, body paragraphs, link.
    """
    resp = requests.get(link, headers=headers, timeout=10)
    # Force UTF-8; without it the decoded text comes out garbled.
    resp.encoding = 'utf8'
    tree = etree.HTML(resp.text)

    title = tree.xpath("//h1[@class='content_left_title']/text()")
    if title:
        body = tree.xpath("//div[@class='left_zw']/p/text()")
    else:
        # Second page layout: title and summary live under other classes.
        title = tree.xpath("//div[@class='content_title']/div[@class='title']/text()")
        body = tree.xpath("//div[@class='content_desc']/p/text()")
    return [[category], title, body, [link]]


def cnew_data():
    """Scrape every article listed on the saved index pages into Excel.

    Reads index-page URLs line by line from ``D:/工作文件/cnew_url.txt``.
    For each index page it collects the category labels and article links,
    fetches every article, and writes one row per article (category, title,
    body text, link) into the active sheet of ``D:\\工作文件\\cnew_data.xlsx``,
    starting at row 1. Multi-part fields are joined with newlines.

    Articles that fail for any reason are skipped and their URL is appended,
    one per line, to ``D:/工作文件/cnew_url1.txt``. The workbook is saved even
    when an index page fails, so rows scraped so far are not lost; that
    failure itself still propagates to the caller.
    """
    workbook_path = r'D:\工作文件\cnew_data.xlsx'
    workbook = openpyxl.load_workbook(workbook_path)
    sheet = workbook.active

    with open(r'D:/工作文件/cnew_url.txt', encoding='utf8') as url_file, \
            open(r'D:/工作文件/cnew_url1.txt', 'a', encoding='utf8') as failed_file:
        row = 1  # Excel row the next successful article is written to
        try:
            for line in url_file:
                page_url = line.strip()
                if not page_url:
                    continue  # tolerate blank lines in the URL list

                page = requests.get(page_url, headers=headers, timeout=10)
                page.encoding = 'utf8'
                index = etree.HTML(page.text)

                categories = index.xpath("//div[@class='dd_lm']/a/text()")
                links = _cnew_article_links(index.xpath("//div[@class='dd_bt']/a/@href"))

                # enumerate keeps category and link aligned with the article
                # being processed, even after an earlier article has failed.
                for pos, link in enumerate(tqdm(links)):
                    try:
                        cells = _cnew_article_row(categories[pos], link)
                        for col, parts in enumerate(cells, start=1):
                            sheet.cell(row, col).value = '\n'.join(parts)
                    except Exception:
                        # Best-effort scrape: record the failing link and move on.
                        failed_file.write(link + '\n')
                        continue
                    row += 1
        finally:
            workbook.save(workbook_path)
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # cnew_url() writes the URL list file that cnew_data() reads. It is left
    # disabled here, so a run only performs the scrape.
    # cnew_url()
    cnew_data()
|