parent
							
								
									7dc8dc4445
								
							
						
					
					
						commit
						d993acfad0
					
				| @ -1,2 +1,65 @@ | |||||||
| # import | import requests | ||||||
|  | import re | ||||||
| 
 | 
 | ||||||
# Request headers: a desktop Chrome User-Agent string sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'}

# Listing pages to crawl. Page 1 has no numeric suffix; pages 2..LAST_PAGE
# are named "<base>-<page>.html".
LIST_URL_BASE = "https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen"
LAST_PAGE = 20

urls = [f"{LIST_URL_BASE}.html"]
# Spell the hyphen out explicitly instead of relying on unary negation of the
# page number inside the f-string to produce it.
urls.extend(f"{LIST_URL_BASE}-{page}.html" for page in range(2, LAST_PAGE + 1))
|  | 
 | ||||||
# Pass 1: fetch every listing page, report its encoding and status code, dump
# the first 800 characters of page 1, and print each article's link, title
# and publication date.
FIRST_PAGE_URL = "https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen.html"
# Seconds to wait for the server; without a timeout requests.get can block
# indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10
# One <li> entry per article. Groups: link, title attribute, text following
# the <img> inside the "fl" span, and the date inside the "fr timee" span.
# Compiled once here because it does not change between pages.
ARTICLE_PATTERN = re.compile(
    r'<li><a href="(.*?)" title="(.*?)"><span class="fl"><img.*?>(.*?)</span><span class="fr timee">(.*?)</span></a></li>',
    re.DOTALL,
)

for url in urls:
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # raise on 4xx/5xx responses
    # Report the response encoding and status code.
    print(f'URL: {url}: Encoding - {response.encoding}, Status Code - {response.status_code}')
    # Show the beginning of the page source for the first page only.
    if url == FIRST_PAGE_URL and response.status_code == 200:
        print("Page 1 Source Code:")
        print(response.text[:800])
    print()  # blank line between the output of different pages

    # Print every matched article; the span text (third group) is not used.
    for link, title, _, date in ARTICLE_PATTERN.findall(response.text):
        print(f"链接: {link}")
        print(f"标题: {title}")
        print(f"发布时间: {date}")
        print()
|  | 
 | ||||||
# Pass 2: print title, span content, publication time and link of every
# article whose title contains '产教融合'.
# Seconds to wait for the server; without a timeout requests.get can block
# indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10
# The keyword may appear anywhere in the title or span content, including at
# the very start or end, so the surrounding text is matched with `*` rather
# than `+`. The span content is matched with a guarded wildcard that cannot
# run past "</li>", so one match never swallows a following list entry.
# NOTE(review): matching the Chinese keyword relies on response.text being
# decoded with the right charset — confirm response.encoding for this site.
KEYWORD_PATTERN = re.compile(
    r'<li><a href="([^"]+)" title="([^"]*产教融合[^"]*)">'
    r'<span class="fl">((?:(?!</li>).)*?产教融合(?:(?!</li>).)*?)</span>'
    r'<span class="fr timee">([^<]+)</span></a></li>'
)

for url in urls:
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # raise on 4xx/5xx responses
    # `published` instead of `time` so the stdlib module name is not shadowed.
    for link, title, content, published in KEYWORD_PATTERN.findall(response.text):
        print(f"标题: {title.strip()}")
        print(f"内容: {content.strip()}")
        print(f"时间: {published.strip()}")
        print(f"链接: {link}")
        print()
|  | 
 | ||||||
# Pass 3: collect the titles of all articles dated 2024-03-xx across every
# listing page, then print how many there are followed by the titles.
# Seconds to wait for the server; without a timeout requests.get can block
# indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10
# Groups: title attribute and the 2024-03-dd date. Attribute values are
# matched with [^"]* and the span content with a wildcard that cannot run past
# "</li>"; otherwise a match could begin in an earlier entry on the same line
# and pair that entry's title with a later entry's March date.
MARCH_2024_PATTERN = re.compile(
    r'<li><a href="[^"]*" title="([^"]*)"><span class="fl">(?:(?!</li>).)*?</span>'
    r'<span class="fr timee">(2024-03-\d\d)</span></a></li>'
)

march_2024_titles = []
for url in urls:
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # raise on 4xx/5xx responses
    # The pattern already restricts the date to 2024-03, so every match counts.
    march_2024_titles.extend(
        title for title, _date in MARCH_2024_PATTERN.findall(response.text)
    )
march_2024_count = len(march_2024_titles)

# Print the results.
print(f"2024年3月份发布的文章数量: {march_2024_count}")
print("所有文章标题:")
for title in march_2024_titles:
    print(title)
					Loading…
					
					
				
		Reference in new issue