Update README.md

2 years ago · d993acfad0
parent 7dc8dc4445
commit d993acfad0
1 changed files with 64 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -1,2 +1,65 @@
-# import
+import requests
 import re
 # Browser-like User-Agent header sent with every request.
 headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'}
 # Listing pages to scrape: page 1 has no numeric suffix, pages 2 through 20
 # are addressed as "...xiaoyuanyaowen-<page>.html".
 urls = ["https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen.html"] + [
     f"https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen-{page}.html"
     for page in range(2, 21)
 ]
 # Seconds to wait on the server before a request is abandoned; without a
 # timeout, requests.get() can block indefinitely on a stalled connection.
 REQUEST_TIMEOUT = 10

 # Only this un-suffixed first page gets a preview of its raw source printed.
 FIRST_PAGE_URL = "https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen.html"

 # Keyword searched for by the second report.
 KEYWORD = "产教融合"

 # Pattern for the per-page dump: link, title, text following the <img>, date.
 LISTING_RE = re.compile(
     r'<li><a href="(.*?)" title="(.*?)"><span class="fl"><img.*?>(.*?)</span>'
     r'<span class="fr timee">(.*?)</span></a></li>',
     re.DOTALL,
 )

 # One news entry: link, title, inner HTML of the "fl" span, date.
 # Attribute values are limited to [^"]* and the span body is not allowed to
 # run past its own </span>, so a match can never stretch over a neighbouring
 # <li> entry even when the whole list sits on a single line.
 ITEM_RE = re.compile(
     r'<li><a href="([^"]*)" title="([^"]*)"><span class="fl">'
     r'((?:(?!</span>).)*)</span>'
     r'<span class="fr timee">([^<]+)</span></a></li>'
 )


 def parse_news_items(html):
     """Return a list of (link, title, content, date) tuples found in *html*.

     *content* is the raw inner HTML of the entry's ``<span class="fl">``.
     An empty list is returned when nothing matches.
     """
     return ITEM_RE.findall(html)


 def find_keyword_items(html, keyword=KEYWORD):
     """Return the entries of *html* whose title and content contain *keyword*.

     The keyword may sit anywhere in the text, including at the very start
     or end of the title.
     """
     return [
         item
         for item in parse_news_items(html)
         if keyword in item[1] and keyword in item[2]
     ]


 def find_month_titles(html, month="2024-03"):
     """Return titles of the entries in *html* dated ``<month>-DD``.

     *month* is a ``YYYY-MM`` string; the entry's date text must be exactly
     that prefix followed by a two-digit day.
     """
     day_re = re.compile(re.escape(month) + r"-\d\d")
     return [
         title
         for _link, title, _content, date in parse_news_items(html)
         if day_re.fullmatch(date)
     ]


 def main():
     """Download each listing page once and print the three reports.

     Raises whatever ``requests`` raises for connection errors, timeouts or
     non-success HTTP status codes (via ``raise_for_status``).
     """
     pages = []
     for url in urls:
         response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
         response.raise_for_status()  # abort on an HTTP error status
         html = response.text
         # Report the detected encoding and the status code.
         print(f'URL: {url}: Encoding - {response.encoding}, Status Code - {response.status_code}')
         # Show the beginning of the page source for the first page only.
         if url == FIRST_PAGE_URL and response.status_code == 200:
             print("Page 1 Source Code:")
             print(html[:800])
         print()  # blank line between the output of different pages
         for link, title, _, date in LISTING_RE.findall(html):
             print(f"链接: {link}")
             print(f"标题: {title}")
             print(f"发布时间: {date}")
             print()
         # Keep the source so the later reports do not download it again.
         pages.append(html)

     # Entries whose title and content mention the keyword.
     for html in pages:
         for link, title, content, date in find_keyword_items(html):
             print(f"标题: {title.strip()}")
             print(f"内容: {content.strip()}")
             print(f"时间: {date.strip()}")
             print(f"链接: {link}")
             print()

     # Number and titles of the articles published in March 2024.
     march_2024_titles = [
         title for html in pages for title in find_month_titles(html)
     ]
     print(f"2024年3月份发布的文章数量: {len(march_2024_titles)}")
     print("所有文章标题:")
     for title in march_2024_titles:
         print(title)


 if __name__ == "__main__":
     main()