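# NOTE: the text below is web-UI residue captured with the script from a
# repository page; it is not part of the program.
# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
#
# 2.7 KiB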

import re

import requests

# Request headers: a User-Agent string identifying as desktop Chrome on Windows.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
}

# Listing pages of the news section. Page 1 has no numeric suffix; pages
# 2..20 are suffixed "-2" .. "-20".
urls = ["https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen.html"]
urls.extend(
    f"https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen-{page}.html"
    for page in range(2, 21)
)

# One entry of the news listing.
# Groups: link, title, visible text inside the first span, publish date.
pattern = re.compile(
    r'<li><a href="(.*?)" title="(.*?)"><span class="fl"><img.*?>(.*?)</span><span class="fr timee">(.*?)</span></a></li>',
    re.DOTALL,
)

# Fetch every listing page, print request diagnostics, then print each entry.
# NOTE(review): entries are extracted for every page. In the captured source
# the loop body was collapsed onto one line, so whether extraction ran per
# page or only on the last response is ambiguous -- confirm.
for url in urls:
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # raise on an HTTP error status

    # Report the encoding and status code of the response.
    print(f'URL: {url}: Encoding - {response.encoding}, Status Code - {response.status_code}')

    # Show the beginning of the page source for the first page only.
    if url == "https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen.html" and response.status_code == 200:
        print("Page 1 Source Code:")
        print(response.text[:800])
    print()  # blank line separating the output of different pages

    # Find all entries on this page and print them.
    matches = pattern.findall(response.text)
    for link, title, _, date in matches:
        print(f"链接: {link}")
        print(f"标题: {title}")
        print(f"发布时间: {date}")
        print()

# Match the title (containing '产教融合'), link and publish time of articles.
#
# NOTE(review): the pattern literal in the captured source was damaged (the
# HTML tags inside it had been stripped). It is reconstructed here from the
# surviving fragments and from the listing markup matched by the full-listing
# pattern earlier in this file -- confirm against the live page.
# The keyword may now sit at the very start or end of the title/text; the
# damaged original required at least one character on both sides.
pattern = re.compile(
    r'<li><a href="([^"]+)" title="([^"]*产教融合[^"]*)"><span class="fl"><img.*?>'
    r'(.*?产教融合.*?)</span><span class="fr timee">([^<]+)</span></a></li>'
)

for url in urls:
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # raise on an HTTP error status

    # Groups: link, title, visible text, publish time.
    matches = pattern.findall(response.text)
    for link, title, content, published in matches:
        print(f"标题: {title.strip()}")
        print(f"内容: {content.strip()}")
        print(f"时间: {published.strip()}")
        print(f"链接: {link}")
        print()

    # Count the articles published in March 2024 and collect all their titles.
#
# NOTE(review): the pattern literal in the captured source was damaged (the
# HTML tags inside it had been stripped). It is reconstructed here from the
# listing markup matched by the full-listing pattern earlier in this file --
# confirm against the live page. Every entry is captured as (title, date) and
# the month filter is applied in Python, so a title can never be paired with
# the date of a different entry.
pattern = re.compile(
    r'<li><a href="[^"]*" title="([^"]*)"><span class="fl"><img.*?>.*?</span>'
    r'<span class="fr timee">(.*?)</span></a></li>',
    re.DOTALL,
)

march_2024_titles = []
for url in urls:
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # raise on an HTTP error status

    # Keep only the titles whose publish date falls in March 2024.
    matches = pattern.findall(response.text)
    march_2024_titles.extend(
        title for title, date in matches if date.strip().startswith('2024-03-')
    )
march_2024_count = len(march_2024_titles)

# Print the results.
print(f"2024年3月份发布的文章数量: {march_2024_count}")
print("所有文章标题:")
for title in march_2024_titles:
    print(title)