import requests
from bs4 import BeautifulSoup
from urllib.parse import quote  # in Python 3, quote lives in urllib.parse; imported here but not used below
import time

f = open('huiyi_01.txt', 'w', encoding='utf-8')  # create a txt file to store the scraped results

# Add a request header to mimic a normal browser visit and avoid being blocked by anti-scraping checks
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
}

time.sleep(5)

# Alternative Firefox header:
# headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'}

# Number of listing pages to scrape
for x in range(5):
    # The trailing number in the path is assumed to be the page index of the listing
    url = 'https://www.zhangqiaokeyan.com/academic-conference-cn_{}/'.format(x + 1)
    res = requests.get(url, headers=headers)
    # Check whether the page was fetched successfully
    print(res.status_code)

    # Parse the listing page
    bs1 = BeautifulSoup(res.text, 'html.parser')
    list_titles = bs1.find_all('li', class_="list_item")
    for i in list_titles:
        # Conference title ("itme_title" is the class name as it appears in the page's markup)
        title = i.find('div', class_="itme_title").text
        print(title)
        f.write("Title: " + title.strip() + '\t')

        # Link to the conference detail page (the href is protocol-relative, so prepend the scheme)
        half_link = i.find('div', class_="itme_title").find('a')['href']
        wholelink = 'http:' + str(half_link)
        print(wholelink)
        f.write("Link: " + wholelink.strip() + '\t')

        # Organizer
        s1 = i.select_one('.item_mid > span').text
        print(s1)
        f.write("Organizer: " + s1.strip() + '\t')

        # Date held; named held_time so it does not shadow the time module
        held_time = i.select_one('.item_right > span').text
        print(held_time)
        f.write("Date: " + held_time.strip())

        f.write('\n')

f.close()
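

# --------------------------------------------------------------------------
# Optional sketch, not called anywhere above: the script keeps huiyi_01.txt
# open for its whole run and only closes it at the very end, so a failed
# request leaves the handle open. The helper below shows the same record
# layout written through a `with` block and the csv module instead. It is a
# minimal sketch; the function name, csv path, and column names are
# illustrative and not part of the original script.
import csv


def write_records_csv(records, path='huiyi_01.csv'):
    """records: an iterable of (title, link, organizer, date) tuples."""
    with open(path, 'w', newline='', encoding='utf-8') as out:
        writer = csv.writer(out)
        writer.writerow(['title', 'link', 'organizer', 'date'])
        for row in records:
            # strip stray whitespace the same way the main script does
            writer.writerow([field.strip() for field in row])

# Example call with hypothetical data:
# write_records_csv([('Some conference', 'http://example.com/x', 'Some society', '2023-03-01')])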