"""Scrape the campus-news list pages of www.jxxdxy.edu.cn and print reports.

Every list page is downloaded once.  Three reports are then printed:

1. link, title and publication date of every article found;
2. the articles whose title contains the keyword in ``KEYWORD``;
3. how many articles were published in the month ``MONTH_PREFIX`` and
   their titles.

NOTE(review): the regular expressions of the previous version had lost
their HTML tags and no longer matched the number of fields the code
unpacked.  The parser below assumes that each article is an ``<li>``
element holding an ``<a href=...>`` (the title) and a ``YYYY-MM-DD`` date
somewhere in its text -- confirm against the live page markup.
"""

import re
from html.parser import HTMLParser
from typing import Iterable, List, NamedTuple, Optional

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'}

_FIRST_PAGE_URL = "https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen.html"

# Page 1 has no numeric suffix; pages 2..20 are suffixed with "-<page>".
urls = [_FIRST_PAGE_URL]
urls.extend(
    f"https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen-{page}.html"
    for page in range(2, 21)
)

# Seconds to wait for the server; without a timeout a stalled connection
# would block the script indefinitely.
REQUEST_TIMEOUT = 10

KEYWORD = "产教融合"
MONTH_PREFIX = "2024-03-"

_DATE_RE = re.compile(r"\d{4}-\d{2}-\d{2}")


class NewsItem(NamedTuple):
    """One article entry taken from a list page."""

    link: str     # href of the entry's first anchor, exactly as written in the page
    title: str    # text inside that anchor
    summary: str  # remaining text of the entry, without the date
    date: str     # first YYYY-MM-DD string found in the entry


class _NewsListParser(HTMLParser):
    """Collect a ``NewsItem`` for every ``<li>`` that has a link and a date."""

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.items: List[NewsItem] = []
        self._link: Optional[str] = None
        self._title_parts: List[str] = []
        self._other_parts: List[str] = []
        self._inside_li = False
        self._inside_title = False

    def handle_starttag(self, tag, attrs):
        if tag == "li":
            # A new <li> also terminates an entry whose </li> was omitted.
            self._finish_entry()
            self._inside_li = True
        elif tag == "a" and self._inside_li and self._link is None:
            href = dict(attrs).get("href")
            if href:
                self._link = href
                self._inside_title = True

    def handle_endtag(self, tag):
        if tag == "a":
            self._inside_title = False
        elif tag == "li":
            self._finish_entry()

    def handle_data(self, data):
        if not self._inside_li:
            return
        if self._inside_title:
            self._title_parts.append(data)
        else:
            self._other_parts.append(data)

    def close(self):
        super().close()
        self._finish_entry()

    def _finish_entry(self) -> None:
        """Store the entry collected so far (if complete) and reset the state."""
        link = self._link
        title = " ".join("".join(self._title_parts).split())
        other = " ".join(" ".join(self._other_parts).split())

        self._link = None
        self._title_parts = []
        self._other_parts = []
        self._inside_li = False
        self._inside_title = False

        if link is None:
            return
        found = _DATE_RE.search(other) or _DATE_RE.search(title)
        if found is None:
            # Entries without a date (e.g. navigation menus) are not articles.
            return
        date = found.group()
        # The date may sit inside or outside the anchor; drop it from both.
        title = title.replace(date, "", 1).strip()
        other = other.replace(date, "", 1).strip()
        if title:
            self.items.append(NewsItem(link, title, other, date))


def parse_news_items(html: str) -> List[NewsItem]:
    """Return the article entries found in the HTML text of one list page."""
    parser = _NewsListParser()
    parser.feed(html)
    parser.close()
    return parser.items


def items_with_keyword(items: Iterable[NewsItem], keyword: str) -> List[NewsItem]:
    """Return the items whose title contains *keyword* at any position."""
    return [item for item in items if keyword in item.title]


def items_in_month(items: Iterable[NewsItem], month_prefix: str) -> List[NewsItem]:
    """Return the items whose date starts with *month_prefix* (``"YYYY-MM-"``)."""
    return [item for item in items if item.date.startswith(month_prefix)]


def _page_text(response) -> str:
    """Return the decoded body of *response*.

    When the server does not declare a charset, requests may fall back to
    ISO-8859-1 for text responses, which garbles Chinese text; in that case
    the encoding detected from the body is used instead.
    """
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = response.apparent_encoding
    return response.text


def main() -> None:
    """Download every list page once and print the three reports."""
    # Third-party dependency, imported here so that the parsing helpers above
    # can be imported without it being installed.
    import requests

    all_items: List[NewsItem] = []

    with requests.Session() as session:
        session.headers.update(headers)
        for url in urls:
            response = session.get(url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()  # abort on 4xx / 5xx answers
            # Show the encoding reported by requests and the status code.
            print(f'URL: {url}: Encoding - {response.encoding}, Status Code - {response.status_code}')
            html = _page_text(response)
            # Show the beginning of the first page's source.
            if url == _FIRST_PAGE_URL and response.status_code == 200:
                print("Page 1 Source Code:")
                print(html[:800])
                print()  # blank line between the outputs of different pages

            page_items = parse_news_items(html)
            for item in page_items:
                print(f"链接: {item.link}")
                print(f"标题: {item.title}")
                print(f"发布时间: {item.date}")
                print()
            all_items.extend(page_items)

    # Articles whose title contains the keyword.
    for item in items_with_keyword(all_items, KEYWORD):
        print(f"标题: {item.title.strip()}")
        print(f"内容: {item.summary.strip()}")
        print(f"时间: {item.date.strip()}")
        print(f"链接: {item.link}")
        print()

    # Number and titles of the articles published in the selected month.
    march_2024_titles = [item.title for item in items_in_month(all_items, MONTH_PREFIX)]
    print(f"2024年3月份发布的文章数量: {len(march_2024_titles)}")
    print("所有文章标题:")
    for title in march_2024_titles:
        print(title)


if __name__ == "__main__":
    main()