diff --git a/附加题 b/附加题
new file mode 100644
index 0000000..d310b03
--- /dev/null
+++ b/附加题
@@ -0,0 +1,78 @@
+import openpyxl
+import requests
+from lxml import etree
+from tqdm import tqdm
+
+headers = {
+    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
+}
+
+
+# Build the date-based scroll-news index URLs and save them to a file
+def cnew_url():
+    f = open(r'D:/工作文件/cnew_url.txt', 'w', encoding='utf8')
+    for i in range(1, 6):
+        if i < 10:
+            url = 'https://www.chinanews.com.cn/scroll-news/2023/020' + str(i) + '/news.shtml'
+        else:
+            url = 'https://www.chinanews.com.cn/scroll-news/2023/02' + str(i) + '/news.shtml'
+        f.write(url + '\n')
+    f.close()
+
+
+def cnew_data():
+    f = open(r'D:/工作文件/cnew_url.txt', encoding='utf8')  # read the index URLs built above
+    wb = openpyxl.load_workbook(r'D:\工作文件\cnew_data.xlsx')
+    sheet = wb.active
+    m = open(r'D:/工作文件/cnew_url1.txt', 'a', encoding='utf8')  # record links that raised errors
+    x = 1  # row of the Excel sheet to start writing from
+    for i in f:
+        lj1 = []
+        # request the index page and collect the article links on it
+        req = requests.get(i.replace('\n', ''), headers=headers)
+        # set the page encoding explicitly, otherwise the text is garbled
+        req.encoding = 'utf8'
+        ht = etree.HTML(req.text)
+        # extract the category names and the article links
+        fl = ht.xpath("//div[@class='dd_lm']/a/text()")
+        lj = ht.xpath("//div[@class='dd_bt']/a/@href")
+        # links come in two formats; normalise both into full URLs
+        for j in lj:
+            if j[:5] == '//www':
+                lj1.append('https:' + j)
+            else:
+                lj1.append('https://www.chinanews.com.cn/' + j)
+        n = 0
+        for k in tqdm(lj1):
+            try:
+                data = []
+                reqs = requests.get(k, headers=headers, timeout=10)
+                reqs.encoding = 'utf8'
+                ht1 = etree.HTML(reqs.text)
+                bt = ht1.xpath("//h1[@class='content_left_title']/text()")  # title
+                if bt:
+                    data.append([fl[n]])
+                    data.append(bt)  # title
+                    data.append(ht1.xpath("//div[@class='left_zw']/p/text()"))  # summary
+                    data.append([lj1[n]])
+                else:
+                    data.append([fl[n]])
+                    data.append(ht1.xpath("//div[@class='content_title']/div[@class='title']/text()"))  # title
+                    data.append(ht1.xpath("//div[@class='content_desc']/p/text()"))  # summary
+                    data.append([lj1[n]])
+                for y in range(len(data)):
+                    sheet.cell(x, y + 1).value = '\n'.join(data[y])
+                x += 1
+                n += 1
+            except Exception:
+                m.write(lj1[n] + '\n')
+                n += 1  # keep category/link indices aligned after a failed article
+                continue
+    wb.save(r'D:\工作文件\cnew_data.xlsx')
+    f.close()
+    m.close()
+
+
+if __name__ == '__main__':
+    # cnew_url()
+    cnew_data()
\ No newline at end of file