You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.
import requests
from bs4 import BeautifulSoup
from urllib . request import quote
import time
# Scrape the conference listing (title, link, organiser, date) from
# zhangqiaokeyan.com and save one record per line to a text file.
#
# NOTE(review): in the copy of this script that was reviewed, every string
# literal was padded with spaces (' w ', ' html.parser ', ' li ', ...).
# Where that padding breaks the program -- file mode, header name/value,
# parser name, tag/class/attribute names, URL pieces -- it is removed below.
# The output labels and the tab/newline separators are kept exactly as found;
# confirm whether their padding is intended.

OUTPUT_PATH = 'huiyi_01.txt'
LIST_URL = 'https://www.zhangqiaokeyan.com/academic-conference-cn_1/'
# Seconds to wait for the server; without a timeout requests.get() can
# block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Send a browser-like User-Agent so the site treats the request as a normal
# visit instead of rejecting it as a crawler.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
    ),
}

# The context manager closes the output file even if a request or a parse
# step raises part-way through.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    # Pause once before the first request.
    time.sleep(5)

    # Number of fetches.
    # NOTE(review): every iteration requests the same URL, so the same
    # listing is written five times. Presumably the page number in the URL
    # was meant to vary -- confirm the site's pagination scheme.
    for _ in range(5):
        url = LIST_URL
        res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        # Show whether the request succeeded.
        print(res.status_code)

        # Parse the page and collect the listing entries.
        bs1 = BeautifulSoup(res.text, 'html.parser')
        list_titles = bs1.find_all('li', class_='list_item')
        for item in list_titles:
            # Title. The class name is spelled 'itme_title' in the original
            # script; presumably that matches the site's markup.
            title_div = item.find('div', class_='itme_title')
            title = title_div.text
            print(title)
            f.write(" 题目: " + title.strip() + ' \t ')

            # Link to the article. The scheme is prepended, so the href is
            # presumably protocol-relative ('//host/path').
            half_link = title_div.find('a')['href']
            wholelink = 'http:' + str(half_link)
            print(wholelink)
            f.write(" 链接: " + wholelink.strip() + ' \t ')

            # Organiser.
            s1 = item.select_one('.item_mid > span').text
            print(s1)
            f.write(" 举办单位: " + s1.strip() + ' \t ')

            # Date held. Named held_on (not `time`) so the imported time
            # module is not overwritten by a string.
            held_on = item.select_one('.item_right > span').text
            print(held_on)
            f.write(" 时间: " + held_on.strip())
            f.write(' \n ')