"""Scrape academic-conference listings from zhangqiaokeyan.com.

For each list item the script records title, detail-page link, host
organisation, and event date into ``huiyi_01.txt`` (tab-separated fields,
one conference per line).
"""
import time

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent so the site does not reject us as a bot.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
}

# `with` guarantees the output file is closed even if a request or parse fails
# (the original opened/closed manually and leaked the handle on error).
with open('huiyi_01.txt', 'w', encoding='utf-8') as f:
    # Number of fetch rounds.
    # NOTE(review): every iteration requests the identical URL, so the same
    # page is scraped 5 times — presumably a page number should vary here;
    # confirm the site's pagination URL pattern before changing it.
    for page in range(5):
        url = 'https://www.zhangqiaokeyan.com/academic-conference-cn_1/'
        # timeout keeps the script from hanging forever on a stalled server;
        # the per-iteration sleep throttles us politely between requests
        # (the original slept once *before* the loop, throttling nothing).
        res = requests.get(url, headers=HEADERS, timeout=30)
        print(res.status_code)  # quick visual check that the fetch succeeded

        soup = BeautifulSoup(res.text, 'html.parser')
        for item in soup.find_all('li', class_="list_item"):
            # Conference title ("itme_title" is the site's own class-name typo).
            title = item.find('div', class_="itme_title").text
            print(title)
            f.write("题目:" + title.strip() + '\t')

            # Detail-page link: the href is protocol-relative, so prefix "http:".
            half_link = item.find('div', class_="itme_title").find('a')['href']
            wholelink = 'http:' + str(half_link)
            print(wholelink)
            f.write("链接:" + wholelink.strip() + '\t')

            # Host organisation.
            organizer = item.select_one('.item_mid > span').text
            print(organizer)
            f.write("举办单位:" + organizer.strip() + '\t')

            # Event date. Renamed from `time`, which shadowed the stdlib
            # module and would break any later time.sleep() call.
            event_time = item.select_one('.item_right > span').text
            print(event_time)
            f.write("时间:" + event_time.strip())

            f.write('\n')

        # Polite delay between successive requests to avoid being rate-limited.
        time.sleep(5)