You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
77 lines
2.9 KiB
77 lines
2.9 KiB
import sys
|
|
# from imp import reload
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
# Target URL: the Douban book "top250" listing, at offset 0 per the `start` query parameter.
url = "https://book.douban.com/top250?start=0"

# Request headers carrying a desktop-browser User-Agent so the request
# presents itself as a regular browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/74.0.3729.108 Safari/537.36"
    ),
}
|
|
|
|
# Fetch the page over HTTP using the `url` and `headers` defined earlier in this file.
# The timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops the run on an HTTP error status instead of
# silently parsing an error page.
resp = requests.get(url, headers=headers, timeout=10)
resp.raise_for_status()

# NOTE: the old Python 2 workaround (reload(sys) + sys.setdefaultencoding) was
# dropped; sys.setdefaultencoding does not exist in Python 3, and the output
# file is written with an explicit encoding further down.

# Parse the returned HTML with the lxml parser so elements can be located
# through BeautifulSoup.
soup = BeautifulSoup(resp.text, 'lxml')
|
|
|
|
# Path of the text file that receives the results.
# Written as a raw string so the Windows backslash separators are taken
# literally; the previous non-raw literal relied on invalid escape sequences
# (e.g. "\p", "\o"), which newer Python versions warn about.
file_out = r"C:\Users\雷雨\pycharmProjects\pythonProject1\out.txt"
|
|
|
|
# Result list, seeded with the heading line; formatted entries are added after it.
result_list = ["豆瓣图书排行榜"]

# Collect the title links with a CSS selector: every <a> that is a direct child
# of a div with class "pl2", itself a direct child of a <td> directly inside a <tr>.
name_list = soup.select('tr > td > div.pl2 > a')

# Rank counter, starting at zero; the loop over name_list increments it per link.
i = 0
|
|
|
|
|
|
# Helper that checks for duplicates while updating a list: an item already in the list is not added again.
|
|
def FindIfRepeat(OldList, item):
    """Append ``item`` to ``OldList`` unless it is already present.

    The list is modified in place. The same list object is returned in both
    cases (previously the "appended" path returned ``None`` because the
    result of ``list.append`` was returned).

    Args:
        OldList: The list to update.
        item: The candidate element.

    Returns:
        ``OldList`` itself, with ``item`` appended if it was missing.
    """
    if item not in OldList:
        OldList.append(item)
    return OldList
|
|
|
|
|
|
# Turn every title link into one formatted ranking line and collect it in result_list.
def _format_book_entry(rank, tokens):
    """Build the output line for one book.

    Args:
        rank: Position of the book among the selected links (1-based).
        tokens: Non-empty list of the whitespace-separated pieces of the link text.

    Returns:
        The formatted line. When the text has at least two pieces and contains
        a ':', the part before the first ':' is the name and the rest is the
        description; otherwise the whole text is the name and the description
        is "暂无".
    """
    joined = " ".join(tokens)
    if len(tokens) >= 2:
        # The trailing space is kept on purpose so the lines stay identical to
        # what the script produced before for titles that carry a description.
        title, colon, description = (joined + " ").partition(':')
        if colon:
            return "Top{rank},书名《{name}》,描述:{description}".format(
                rank=rank, name=title, description=description)
    # No description: a single piece, or several pieces without a ':'
    # (the latter used to raise IndexError).
    return "Top{rank},书名《{name}》,描述:暂无".format(rank=rank, name=joined)


for i, name in enumerate(name_list, start=1):
    # split() without arguments also discards leading/trailing whitespace.
    tokens = name.get_text().split()
    if not tokens:
        # A link without text yields no entry.
        continue
    # FindIfRepeat appends the line to result_list unless it is already there.
    FindIfRepeat(result_list, _format_book_entry(i, tokens))
|
|
|
|
# Append every result line to the output file.
# The file is opened once (instead of once per line) in append mode, and the
# `with` block closes it automatically. newline="" turns off newline
# translation so each record ends in exactly "\r\n"; in default text mode on
# Windows the "\n" would be expanded again, producing "\r\r\n".
with open(file_out, "a", encoding="utf-8", newline="") as f:
    f.writelines(str(line) + "\r\n" for line in result_list)
|
|
|
|
|