ADD file via upload

5 months ago · a0609203b7
parent 56923e126c
commit a0609203b7
1 changed files with 76 additions and 0 deletions
--- a/ly.py
+++ b/ly.py
@ -0,0 +1,76 @@
+import sys
+# from imp import reload
+import requests
+from bs4 import BeautifulSoup
+
+
+# Request headers carrying a desktop-browser User-Agent string.
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}
+
+# Target URL to scrape.
+url = 'https://book.douban.com/top250?start=0'
+
+# Fetch the page. The timeout keeps the script from hanging forever on a
+# stalled connection, and raise_for_status() stops early on an HTTP error
+# instead of parsing an error page as if it were the ranking.
+resp = requests.get(url, headers=headers, timeout=10)
+resp.raise_for_status()
+
+# Parse the returned HTML with the lxml parser.
+soup = BeautifulSoup(resp.text, 'lxml')
+
+# File the results are written to. A raw string keeps every backslash literal
+# without relying on unrecognised escape sequences such as "\p" or "\o".
+file_out = r"C:\Users\雷雨\pycharmProjects\pythonProject1\out.txt"
+
+# Rank counter used while walking the matched links.
+i = 0
+
+# select() takes a CSS selector and returns a list of matching tags; this one
+# matches <a> elements directly inside <div class="pl2"> within a table cell.
+name_list = soup.select('tr > td > div.pl2 > a')
+
+# Result lines, starting with the heading line.
+result_list = ["豆瓣图书排行榜"]
+
+
+def FindIfRepeat(OldList, item):
+    """Append *item* to *OldList* unless it is already present.
+
+    The list is modified in place, and the same list object is returned
+    whether or not anything was added.
+
+    Args:
+        OldList: The list to update.
+        item: The candidate element.
+
+    Returns:
+        ``OldList`` itself, containing ``item`` exactly as many times as
+        before if it was already there, or once more otherwise.
+    """
+    if item not in OldList:
+        # list.append() returns None, so its result must not be returned.
+        OldList.append(item)
+    return OldList
+
+
+# Build one formatted line per matched link; `i` is the 1-based rank.
+for i, name in enumerate(name_list, start=1):
+    # split() with no argument drops surrounding whitespace and collapses
+    # internal runs, so re-joining yields the link text with single spaces.
+    tokens = name.get_text().split()
+    if not tokens:
+        continue  # empty link text: nothing to record
+    text = " ".join(tokens)
+
+    # Text before the first ':' is the title, the remainder the description.
+    # partition() never raises, so multi-word text without a ':' falls through
+    # to the "no description" branch instead of failing on a missing index.
+    title, sep, description = text.partition(':')
+    if len(tokens) >= 2 and sep:
+        title = title.strip()
+        description = description.strip() or "暂无"
+    else:
+        title = text
+        description = "暂无"
+
+    name_format = "Top{rank}，书名《{name}》，描述：{description}".format(
+        rank=i, name=title, description=description)
+    # FindIfRepeat appends to result_list in place only when the line is new.
+    FindIfRepeat(result_list, name_format)
+
+# Append every collected line to the output file, opening it only once.
+# newline="" turns off newline translation so the explicit "\r\n" is written
+# as-is; in default text mode Windows would expand it to "\r\r\n".
+with open(file_out, "a", encoding="utf-8", newline="") as f:
+    f.writelines(str(line) + '\r\n' for line in result_list)
+
+