import time

import requests
from bs4 import BeautifulSoup

# Create a txt file to store the scraped results
f = open('huiyi_01.txt', 'w', encoding='utf-8')

# Request headers: mimic a normal browser visit to avoid basic anti-scraping checks
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
}

# Number of pages to crawl
for x in range(5):
    # The trailing "_1" in the listing URL appears to be a page index,
    # so x + 1 is substituted here on that assumption
    url = f'https://www.zhangqiaokeyan.com/academic-conference-cn_{x + 1}/'
    res = requests.get(url, headers=headers)
    # Check whether the request succeeded (200 means OK)
    print(res.status_code)
    # Parse the page
    bs1 = BeautifulSoup(res.text, 'html.parser')
    list_titles = bs1.find_all('li', class_="list_item")
    for i in list_titles:
        # Title ("itme_title" is the class name as it appears in the site's HTML)
        title = i.find('div', class_="itme_title").text
        print(title)
        f.write("题目:" + title.strip() + '\t')
        # Link to the conference detail page (hrefs are protocol-relative, e.g. "//www...")
        half_link = i.find('div', class_="itme_title").find('a')['href']
        wholelink = 'https:' + str(half_link)
        print(wholelink)
        f.write("链接:" + wholelink.strip() + '\t')
        # Organizer
        s1 = i.select_one('.item_mid > span').text
        print(s1)
        f.write("举办单位:" + s1.strip() + '\t')
        # Event date (renamed so it no longer shadows the time module)
        event_time = i.select_one('.item_right > span').text
        print(event_time)
        f.write("时间:" + event_time.strip())
        f.write('\n')
    # Pause between page requests to stay polite to the server
    time.sleep(5)

f.close()
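One fragility worth noting: chained calls like `i.find(...).text` raise `AttributeError` the moment a listing item lacks the expected tag, which aborts the whole crawl. Below is a minimal defensive sketch of the per-item extraction; the class names (`itme_title`, `.item_mid > span`, `.item_right > span`) come from the script above, while `safe_text` and `parse_item` are helper names introduced here purely for illustration.

```python
from bs4 import BeautifulSoup


def safe_text(node, default=''):
    """Return the stripped text of a bs4 node, or a default when the node is missing."""
    return node.get_text(strip=True) if node is not None else default


def parse_item(i):
    """Extract title, link, organizer, and date from one <li class="list_item"> element."""
    title_div = i.find('div', class_="itme_title")
    a = title_div.find('a') if title_div is not None else None
    half_link = a['href'] if a is not None and a.has_attr('href') else ''
    return {
        'title': safe_text(title_div),
        # Prepend the scheme only for protocol-relative hrefs ("//www...")
        'link': 'https:' + half_link if half_link.startswith('//') else half_link,
        'organizer': safe_text(i.select_one('.item_mid > span')),
        'date': safe_text(i.select_one('.item_right > span')),
    }
```

With this in place, the inner loop becomes `row = parse_item(i)` and a missing field yields an empty string instead of a crash.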
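The tab-separated txt output also works, but the csv module handles quoting and delimiters automatically and the result opens cleanly in spreadsheet tools. A sketch, assuming rows are the dicts produced by the hypothetical `parse_item` above:

```python
import csv

rows = []  # filled with parse_item(...) dicts during the crawl

# utf-8-sig adds a BOM so Excel detects the encoding of Chinese text correctly
with open('huiyi_01.csv', 'w', encoding='utf-8-sig', newline='') as fh:
    writer = csv.DictWriter(fh, fieldnames=['title', 'link', 'organizer', 'date'])
    writer.writeheader()
    writer.writerows(rows)
```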