|
|
|
@ -0,0 +1,154 @@
|
|
|
|
|
import json
|
|
|
|
|
|
|
|
|
|
import parsel
|
|
|
|
|
import requests
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def findNovelContent(url, name):
    """Scrape every chapter of a novel from www.1234u.net into '<name>.txt'.

    url  -- the novel's table-of-contents page URL
    name -- the novel title, used as the output file name (one .txt per novel)
    """
    # Browser-like User-Agent so the site does not reject the request as a bot.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41'
    }
    response = requests.get(url=url, headers=headers).text  # table-of-contents HTML
    soup = BeautifulSoup(response, 'lxml')
    # Each chapter entry is an <li> under the <ul> inside class 'ml_list'.
    li_list = soup.select('.ml_list>ul>li')
    # 'with' guarantees the file is closed even if a request fails mid-loop
    # (the original opened the handle and never closed it).
    with open(name + '.txt', 'w', encoding='UTF-8') as fp:
        for li in li_list:
            title = li.a.string  # chapter title is the text of the <a> tag
            # href is site-relative; prepend the host to build the full chapter URL
            detail_url = 'https://www.1234u.net' + li.a['href']
            detail_page_text = requests.get(url=detail_url, headers=headers).text
            detail_soup = BeautifulSoup(detail_page_text, 'lxml')
            # The chapter body lives in <p class="articlecontent">.
            div_tag = detail_soup.find('p', class_="articlecontent")
            content = div_tag.text
            fp.write(title + ':' + content + '\n')
            print(title, '爬取成功!!!')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def findNovelPrice(item):
    """Scrape title / price / cover-image info for *item* from dangdang.com
    search results (pages 1..98) and dump the records to '<item>.json'.

    item -- the book title to search for; also used as the JSON file name
    """
    ls = []  # accumulated record dicts across all result pages
    print("开始爬取{0}的信息".format(item))
    # Request headers are loop-invariant — build them once, not per page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41'}
    for page in range(1, 99):
        print(f'========================正在保存第{page}页数据内容===================================')
        # item is the book title, page the 1-based results-page index
        url = f'https://search.dangdang.com/?key={item}&act=input&page_index={page}'
        response = requests.get(url=url, headers=headers)
        selector = parsel.Selector(response.text)
        lis = selector.css('.bigimg li')  # one <li> per book in the results grid
        for li in lis:
            title = li.css('.name a::attr(title)').get()        # book title
            price_n = li.css('.price .search_now_price::text').get()  # current price
            picture = li.css('.pic img::attr(data-original)').get()   # cover image URL
            ls.append({
                '标题': title,
                '售价': price_n,
                '图片地址': picture,
            })
    file_name = str(item) + '.json'
    # NOTE(review): ls[1:] drops the very first scraped record — presumably an
    # ad/sponsored slot at the top of the results; confirm this is intentional.
    # 'with' ensures the file is closed even if json serialization raises.
    with open(file_name, "w", encoding='utf-8') as fw:
        fw.write(json.dumps(ls[1:], sort_keys=False, indent=4, ensure_ascii=False))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def findNovelComment(novelname, url):
    """Scrape the short reader comments from a Douban book page and write them,
    numbered, to '<novelname> 的豆瓣评论.txt'.

    novelname -- the book title, used in the output file name
    url       -- the Douban comments-page URL for that book
    """
    # Browser-like User-Agent so the site does not reject the request as a bot.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41"
    }
    res = requests.get(url=url, headers=headers)
    res = BeautifulSoup(res.text, "lxml")

    # Comments live inside <div id="comments">, one per <p> tag.
    comments = res.find("div", id="comments")
    p_list = comments.find_all("p")

    # Collect comment texts; each comment's text is in a <span> inside the <p>.
    data_list = []
    for p in p_list:
        span = p.find("span")
        # Guard: some <p> tags carry no <span>, or the span has nested markup
        # (string is None) — the original crashed on both cases.
        if span is not None and span.string is not None:
            data_list.append(span.string)

    novelname1 = f"{novelname} 的豆瓣评论"
    with open(novelname1 + ".txt", "w", encoding='utf-8') as f:
        # enumerate replaces the original's manual int<->str counter churn.
        for n, comment in enumerate(data_list, start=1):
            f.write("(" + str(n) + ")" + comment + "\n")  # one numbered comment per line
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Interactive entry point: pick which kind of data to scrape for one of
    # the Four Great Classical Novels (四大名著).
    select = input("请您选择进行下列爬取内容:(1)四大名著小说;(2)四大名著价格信息;(3)四大名著豆瓣评论:")
    select = int(select)

    if select == 1:
        print("#" * 20)
        # Full novel text
        book = input("请输入您想查询的四大名著:")
        # Table-of-contents URL on www.1234u.net for each novel.
        param = {'三国演义': 'https://www.1234u.net/xiaoshuo/121/121220/',
                 '水浒传': 'https://www.1234u.net/141/141647/',
                 '红楼梦': 'https://www.1234u.net/192/192648/',
                 # Fixed: original URL had a stray leading '2' ('2https://...'),
                 # which made this entry always fail.
                 '西游记': 'https://www.1234u.net/157/157874/'}
        # Direct membership test replaces the original counter-based loop.
        if book in param:
            findNovelContent(param[book], book)
        else:
            print("您输入的不是四大名著!")

    elif select == 2:
        print("#" * 20)
        book = input("请输入您想查询的四大名著:")
        # Price, title and cover-image info from dangdang.com
        book_list = ['三国演义', '红楼梦', '水浒传', '西游记']
        if book in book_list:
            findNovelPrice(book)
            # Only report completion on success (the original printed this
            # even when the input was invalid).
            print("价格爬取结束!")
        else:
            print("您输入的不是四大名著!")

    else:
        # Douban reader comments
        print("#" * 20)
        book = input("请输入您想查询的四大名著:")
        # Douban comments-page URL for each novel.
        param1 = {
            '西游记': "https://book.douban.com/subject/30137808/comments/?start=0&limit=20&status=P&sort"
                      "=new_score",
            '红楼梦': "https://book.douban.com/subject/1007305/comments/?start=0&limit=20&status=P&sort"
                      "=new_score",
            '水浒传': "https://book.douban.com/subject/30137810/comments/?start=0&limit=20&status=P&sort"
                      "=new_score",
            '三国演义': "https://book.douban.com/subject/26416768/comments/?start=0&limit=20&status=P&sort"
                       "=new_score"}
        if book in param1:
            findNovelComment(book, param1[book])
            print("评语爬取成功!!")
        else:
            print("您输入的不是四大名著!")
|
|
|
|
|
|
|
|
|
|
|