commit 2d0a10ec2c2d991f2454d9e9854d632d556b268d
Author: kun <13098278+w52020031129@user.noreply.gitee.com>
Date: Mon May 27 10:49:22 2024 +0800
5-27
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..35410ca
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# 默认忽略的文件
+/shelf/
+/workspace.xml
+# 基于编辑器的 HTTP 客户端请求
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..7c4836f
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..a44514f
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..31c3df9
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/豆瓣读书250爬虫_20220926.iml b/.idea/豆瓣读书250爬虫_20220926.iml
new file mode 100644
index 0000000..36928bc
--- /dev/null
+++ b/.idea/豆瓣读书250爬虫_20220926.iml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/11_DouBan250Spider.py b/11_DouBan250Spider.py
new file mode 100644
index 0000000..545babe
--- /dev/null
+++ b/11_DouBan250Spider.py
@@ -0,0 +1,99 @@
+
+import requests # 发送请求
+from bs4 import BeautifulSoup # 解析网页
+import pandas as pd # 存取csv
+from time import sleep # 等待时间
+
+# Module-level column accumulators: get_book_info() appends to them and save_to_csv() reads them.
+book_name = [] # book title
+book_url = [] # link to the book page
+book_star = [] # rating
+book_star_people = [] # number of people who rated
+book_author = [] # author
+book_translater = [] # translator
+book_publisher = [] # publisher
+book_pub_year = [] # publication date
+book_price = [] # price
+book_comment = [] # one-line review
+
+
+def get_book_info(url, headers):
+    """
+    Scrape one list page and append its books to the module-level column lists.
+
+    Every ``.item`` element contributes exactly one entry to each ``book_*``
+    list, so the lists stay the same length and can be combined into one
+    table afterwards.
+
+    :param url: address of the list page to download
+    :param headers: HTTP headers passed to ``requests.get``
+    :return: None
+    """
+    res = requests.get(url, headers=headers)
+    soup = BeautifulSoup(res.text, 'html.parser')
+    for book in soup.select('.item'):
+        title_link = book.select('.pl2 a')[0]
+        book_name.append(title_link['title'])  # title
+        book_url.append(title_link['href'])  # link to the book page
+        book_star.append(book.select('.rating_nums')[0].text)  # rating
+
+        pl_nodes = book.select('.pl')
+        # Strip the blanks, the surrounding "(\n" / "\n)" and the "人评价" suffix
+        # so that only the count is left.
+        star_people = (
+            pl_nodes[1].text.strip()
+            .replace(' ', '')
+            .replace('人评价', '')
+            .replace('(\n', '')
+            .replace('\n)', '')
+        )
+        book_star_people.append(star_people)
+
+        # Some entries carry no one-line review.
+        quote = book.select('.quote span')
+        book_comment.append(quote[0].text if quote else None)
+
+        # The info line is a '/'-separated list whose length depends on which
+        # fields are present.
+        info = pl_nodes[0].text.split('/')
+        if len(info) == 5:  # author / translator / publisher / date / price
+            author, translater, publisher, pub_year, price = info
+        elif len(info) == 4:  # no translator
+            author, publisher, pub_year, price = info
+            translater = None
+        elif len(info) == 6:  # two prices
+            author, translater, publisher, pub_year = info[:4]
+            price = info[4] + '/' + info[5]
+        elif len(info) == 3:  # neither author nor translator
+            publisher, pub_year, price = info
+            author = translater = None
+        else:
+            # Unknown layout: still record placeholders. Appending nothing here
+            # would leave these five lists shorter than the other five.
+            print('Unrecognised info line, storing empty fields: {!r}'.format(pl_nodes[0].text))
+            author = translater = publisher = pub_year = price = None
+        book_author.append(author)
+        book_translater.append(translater)
+        book_publisher.append(publisher)
+        book_pub_year.append(pub_year)
+        book_price.append(price)
+
+
+def save_to_csv(csv_name):
+    """
+    Write the collected book columns to a CSV file.
+
+    :param csv_name: path of the CSV file to create
+    :return: None
+    """
+    columns = {
+        '书名': book_name,
+        '豆瓣链接': book_url,
+        '作者': book_author,
+        '译者': book_translater,
+        '出版社': book_publisher,
+        '出版日期': book_pub_year,
+        '价格': book_price,
+        '评分': book_star,
+        '评分人数': book_star_people,
+        '一句话评价': book_comment,
+    }
+    table = pd.DataFrame(columns)
+    # The utf_8_sig codec prepends a UTF-8 byte-order mark to the file.
+    table.to_csv(csv_name, encoding='utf_8_sig')
+
+
+if __name__ == "__main__":
+    # Request headers: present a desktop-browser User-Agent.
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
+    # NOTE(review): the earlier comment spoke of 10 pages of 25 entries, but
+    # range(1) downloads only the first page - confirm which is intended.
+    for page_index in range(1):
+        page_url = f'https://book.douban.com/top250?start={page_index * 25}'
+        print(f'开始爬取第{page_index + 1}页,地址是:{page_url}')
+        get_book_info(page_url, headers)
+        sleep(1)  # pause for one second between pages
+    # Write everything that was collected to the CSV file.
+    save_to_csv(csv_name="BookDouban250.csv")
diff --git a/BookDouban250.csv b/BookDouban250.csv
new file mode 100644
index 0000000..26f7d86
--- /dev/null
+++ b/BookDouban250.csv
@@ -0,0 +1,226 @@
+书名,豆瓣链接,作者,译者,出版社,出版日期,价格,评分,评分人数,一句话评价
+红楼梦,https://book.douban.com/subject/1007305/,[清] 曹雪芹 著 ,, 人民文学出版社 , 1996-12 , 59.70元,9.6,"(
+ 424383人评价
+ )",都云作者痴,谁解其中味?
+活着,https://book.douban.com/subject/4913064/,余华 ,, 作家出版社 , 2012-8 , 20.00元,9.4,"(
+ 834408人评价
+ )",生的苦难与伟大
+1984,https://book.douban.com/subject/4820710/,[英] 乔治·奥威尔 , 刘绍铭 , 北京十月文艺出版社 , 2010-4-1 , 28.00,9.4,"(
+ 284757人评价
+ )",栗树荫下,我出卖你,你出卖我
+哈利·波特,https://book.douban.com/subject/24531956/,J.K.罗琳 (J.K.Rowling) , 苏农 , 人民文学出版社 , 2008-12-1 , 498.00元,9.7,"(
+ 107448人评价
+ )",从9¾站台开始的旅程
+三体全集,https://book.douban.com/subject/6518605/,刘慈欣 ,, 重庆出版社 , 2012-1 , 168.00元,9.5,"(
+ 187889人评价
+ )",地球往事三部曲
+百年孤独,https://book.douban.com/subject/6082808/,[哥伦比亚] 加西亚·马尔克斯 , 范晔 , 南海出版公司 , 2011-6 , 39.50元,9.3,"(
+ 426035人评价
+ )",魔幻现实主义文学代表作
+飘,https://book.douban.com/subject/1068920/,[美国] 玛格丽特·米切尔 , 李美华 , 译林出版社 , 2000-9 , 40.00元,9.3,"(
+ 212637人评价
+ )",革命时期的爱情,随风而逝
+动物农场,https://book.douban.com/subject/2035179/,[英] 乔治·奥威尔 , 荣如德 , 上海译文出版社 , 2007-3 , 10.00元,9.3,"(
+ 160188人评价
+ )",太阳底下并无新事
+房思琪的初恋乐园,https://book.douban.com/subject/27614904/,林奕含 ,, 北京联合出版公司 , 2018-2 , 45.00元,9.2,"(
+ 380002人评价
+ )",向死而生的文学绝唱
+三国演义(全二册),https://book.douban.com/subject/1019568/,[明] 罗贯中 ,, 人民文学出版社 , 1998-05 , 39.50元,9.3,"(
+ 168645人评价
+ )",是非成败转头空
+福尔摩斯探案全集(上中下),https://book.douban.com/subject/1040211/,[英] 阿·柯南道尔 , 丁钟华 等 , 群众出版社 , 1981-8 , 53.00元/68.00元,9.3,"(
+ 133712人评价
+ )",名侦探的代名词
+白夜行,https://book.douban.com/subject/10554308/,[日] 东野圭吾 , 刘姿君 , 南海出版公司 , 2013-1-1 , 39.50元,9.2,"(
+ 475686人评价
+ )",一宗离奇命案牵出跨度近20年步步惊心的故事
+小王子,https://book.douban.com/subject/1084336/,[法] 圣埃克苏佩里 , 马振骋 , 人民文学出版社 , 2003-8 , 22.00元,9.1,"(
+ 756658人评价
+ )",献给长成了大人的孩子们
+安徒生童话故事集,https://book.douban.com/subject/1046209/,(丹麦)安徒生 , 叶君健 , 人民文学出版社 , 1997-08 , 25.00元,9.3,"(
+ 132841人评价
+ )",为了争取未来的一代
+天龙八部,https://book.douban.com/subject/1255625/,金庸 ,, 生活·读书·新知三联书店 , 1994-5 , 96.00元,9.2,"(
+ 133043人评价
+ )",有情皆孽,无人不冤
+呐喊,https://book.douban.com/subject/1449351/,鲁迅 ,, 人民文学出版社 , 1973-3 , 0.36元,9.2,"(
+ 160624人评价
+ )",新文学的第一声呐喊
+撒哈拉的故事,https://book.douban.com/subject/1060068/,三毛 ,, 哈尔滨出版社 , 2003-8 , 15.80元,9.2,"(
+ 175143人评价
+ )",游荡的自由灵魂
+悉达多,https://book.douban.com/subject/26980487/,[德] 赫尔曼·黑塞 , 姜乙 , 天津人民出版社 , 2017-1 , 32.00元,9.3,"(
+ 105727人评价
+ )",
+邓小平时代,https://book.douban.com/subject/20424526/,【美】傅高义 (Ezra.F.Vogel) , 冯克利 , 生活·读书·新知三联书店 , 2013-1-18 , 88.00元,9.3,"(
+ 70891人评价
+ )",个人命运背后的历史变局
+杀死一只知更鸟,https://book.douban.com/subject/6781808/,[美] 哈珀·李 , 高红梅 , 译林出版社 , 2012-9 , 32.00元,9.2,"(
+ 146983人评价
+ )",有一种东西不能遵循从众原则,那就是——人的良心
+失踪的孩子,https://book.douban.com/subject/30172069/,[意] 埃莱娜·费兰特 , 陈英 , 人民文学出版社 , 2018-7 , 62.00元,9.2,"(
+ 81644人评价
+ )",我的整个生命,只是一场为了提升社会地位的低俗斗争。
+明朝那些事儿(1-9),https://book.douban.com/subject/3674537/,当年明月 ,, 中国海关出版社 , 2009-4 , 358.20元,9.2,"(
+ 175139人评价
+ )",不拘一格的历史书写
+新名字的故事,https://book.douban.com/subject/26986954/,[意] 埃莱娜·费兰特 , 陈英 , 人民文学出版社 , 2017-4 , 59.00元,9.2,"(
+ 92225人评价
+ )",探索青年时代的激情、困惑、挣扎、背叛和失去
+野草,https://book.douban.com/subject/1915958/,鲁迅 ,, 人民文学出版社 , 1973-3 , 0.20元,9.5,"(
+ 47503人评价
+ )",我以这一丛野草,在明与暗,生与死,过去与未来之际,献于友与仇,人与兽,爱者与不爱者之前作证。
+沉默的大多数,https://book.douban.com/subject/1054685/,王小波 ,, 中国青年出版社 , 1997-10 , 27.00元,9.1,"(
+ 151597人评价
+ )",沉默是沉默者的通行证
+中国历代政治得失,https://book.douban.com/subject/1003479/,钱穆 ,, 生活·读书·新知三联书店 , 2001 , 12.00元,9.2,"(
+ 72960人评价
+ )",一部简明的“中国政治制度史”
+局外人,https://book.douban.com/subject/4908885/,[法] 阿尔贝·加缪 , 柳鸣九 , 上海译文出版社 , 2010-8 , 22.00元,9.1,"(
+ 240586人评价
+ )",人生在世,永远也不该演戏作假
+乡土中国,https://book.douban.com/subject/1795079/,费孝通 ,, 上海人民出版社 , 2006-04-01 , 38.00,9.2,"(
+ 90709人评价
+ )",中国乡土社会传统文化和社会结构理论研究代表作
+白鹿原,https://book.douban.com/subject/10564071/,陈忠实 ,, 人民文学出版社 , 2012-9 , 39.00元,9.2,"(
+ 108128人评价
+ )",一轴关于我们民族灵魂的现实主义画卷
+卡拉马佐夫兄弟,https://book.douban.com/subject/25887924/,[俄] 费奥多尔·陀思妥耶夫斯基 , 荣如德 , 上海译文出版社 , 2015-2-1 , CNY 78.00,9.6,"(
+ 34343人评价
+ )",
+人类简史,https://book.douban.com/subject/25985021/,[以色列] 尤瓦尔·赫拉利 , 林俊宏 , 中信出版社 , 2014-11 , 68.00元,9.1,"(
+ 199868人评价
+ )",跟着人类一同走过十万年
+围城,https://book.douban.com/subject/1008145/,钱锺书 ,, 人民文学出版社 , 1991-2 , 19.00,9.0,"(
+ 447368人评价
+ )",幽默的语言和对生活深刻的观察
+彷徨,https://book.douban.com/subject/1449348/,鲁迅 ,, 人民文学出版社 , 1973-3 , 0.37元,9.3,"(
+ 65707人评价
+ )",路漫漫其修远兮,吾将上下而求索
+平凡的世界(全三部),https://book.douban.com/subject/1200840/,路遥 ,, 人民文学出版社 , 2005-1 , 64.00元,9.0,"(
+ 324539人评价
+ )",中国当代城乡生活全景
+罗杰疑案,https://book.douban.com/subject/21371175/,[英] 阿加莎·克里斯蒂 , 常禾 , 新星出版社 , 2013-3 , 28.00元,9.2,"(
+ 74837人评价
+ )",
+许三观卖血记,https://book.douban.com/subject/4760224/,余华 ,, 作家出版社 , 2012-9 , 24.00元,9.2,"(
+ 164226人评价
+ )",
+我与地坛,https://book.douban.com/subject/1209899/,史铁生 ,, 春风文艺出版社 , 2002-5 , 25.00元,9.2,"(
+ 109004人评价
+ )",这是你的罪孽与福祉
+笑傲江湖(全四册),https://book.douban.com/subject/1002299/,金庸 ,, 生活·读书·新知三联书店 , 1994-5 , 76.80元,9.1,"(
+ 111909人评价
+ )",欲练此功,必先自宫
+献给阿尔吉侬的花束,https://book.douban.com/subject/26362836/,[美] 丹尼尔·凯斯 , 陈澄和 , 广西师范大学出版社 , 2015-4 , 36.00元,9.1,"(
+ 109249人评价
+ )",当声称能改造智能的科学实验选中心智障碍主角
+东方快车谋杀案,https://book.douban.com/subject/1827374/,[英] 阿加莎·克里斯蒂 , 陈尧光 , 人民文学出版社 , 2006-5 , 18.00元,9.1,"(
+ 139814人评价
+ )",谋杀诡计惊人,波洛的抉择耐人寻味
+肖申克的救赎,https://book.douban.com/subject/1829226/,[美] 斯蒂芬·金 , 施寄青 , 人民文学出版社 , 2006-7 , 29.90元,9.1,"(
+ 121642人评价
+ )",豆瓣电影Top1原著
+江城,https://book.douban.com/subject/7060185/,[美] 彼得·海斯勒 , 李雪顺 , 上海译文出版社 , 2012-1 , 36.00元,9.1,"(
+ 63040人评价
+ )",外国人眼中的涪陵
+基督山伯爵,https://book.douban.com/subject/1085860/,[法国] 大仲马 , 周克希 , 上海译文出版社 , 1991-12-1 , 43.90元,9.1,"(
+ 129540人评价
+ )",一个报恩复仇的故事,以法国波旁王朝和七月王朝为背景
+城南旧事,https://book.douban.com/subject/1254588/,林海音 文 ,, 中国青年出版社 , 2003-7 , 16.00元,9.1,"(
+ 157423人评价
+ )",长亭外,古道边,芳草碧连天
+霍乱时期的爱情,https://book.douban.com/subject/10594787/,[哥伦比亚] 加西亚·马尔克斯 , 杨玲 , 南海出版公司 , 2012-9-1 , 39.50元,9.0,"(
+ 286118人评价
+ )",义无反顾地直达爱情的核心
+故事新编,https://book.douban.com/subject/2046909/,鲁迅 ,, 人民文学出版社 , 1973-12-01 , 0.31 元,9.4,"(
+ 43436人评价
+ )",拾取古代传说,取一点因由,随意点染
+艺术的故事,https://book.douban.com/subject/3162991/,[英] 贡布里希 (Sir E.H.Gombrich) , 范景中 , 广西美术出版社 , 2008-04 , 280.00,9.6,"(
+ 26388人评价
+ )",从最早的洞窟绘画到当今的实验艺术
+万历十五年,https://book.douban.com/subject/1041482/,[美] 黄仁宇 ,, 生活·读书·新知三联书店 , 1997-5 , 18.00元,9.0,"(
+ 202254人评价
+ )",见微知著,历史观的颠覆
+朝花夕拾,https://book.douban.com/subject/1449352/,鲁迅 ,, 人民文学出版社 , 1973-4 , 0.25元,9.1,"(
+ 155213人评价
+ )",在纷扰中寻出一点闲静
+月亮和六便士,https://book.douban.com/subject/1858513/,[英] 毛姆 , 傅惟慈 , 上海译文出版社 , 2006-8 , 15.00元,9.0,"(
+ 209821人评价
+ )",有多少人会经历顿悟,就有更少的人甘愿自我放逐
+厌女,https://book.douban.com/subject/25836270/,上野千鹤子 , 王兰 , 上海三联书店 , 2015-1 , 28.00,9.1,"(
+ 82066人评价
+ )",
+秋园,https://book.douban.com/subject/34998019/,杨本芬 ,, 北京联合出版公司 , 2020-6 , 38.00元,9.0,"(
+ 108614人评价
+ )",
+射雕英雄传,https://book.douban.com/subject/1044547/,金庸 ,, 生活·读书·新知三联书店 , 1999-04 , 47.00元,9.1,"(
+ 85813人评价
+ )",侠之大者,为国为民
+置身事内,https://book.douban.com/subject/35546622/,兰小欢 ,, 上海人民出版社 , 2021-8 , 65.00元,9.1,"(
+ 80263人评价
+ )",
+追风筝的人,https://book.douban.com/subject/1770782/,[美] 卡勒德·胡赛尼 , 李继宏 , 上海人民出版社 , 2006-5 , 29.00元,8.9,"(
+ 795265人评价
+ )",为你,千千万万遍
+树上的男爵,https://book.douban.com/subject/6789605/,[意大利]伊塔洛·卡尔维诺 , 吴正仪 , 译林出版社 , 2012-4-1 , 30.00元,9.1,"(
+ 60492人评价
+ )",是不是真的只有先与人疏离,才能最终与他们在一起?
+寻路中国,https://book.douban.com/subject/5414391/,[美] 彼得·海斯勒 , 李雪顺 , 上海译文出版社 , 2011-1 , 33.00元,9.0,"(
+ 56992人评价
+ )",《纽约客》驻北京记者驾车漫游中国大陆的经历
+刀锋,https://book.douban.com/subject/2035162/,[英]毛姆 , 周煦良 , 上海译文出版社 , 2007-3 , 18.00元,9.0,"(
+ 92048人评价
+ )",一把刀的锋刃不容易越过;因此智者说得救之道是困难的
+无人生还,https://book.douban.com/subject/3006581/,[英] 阿加莎・克里斯蒂 , 祁阿红 , 人民文学出版社 , 2008-3 , 19.00,9.0,"(
+ 150494人评价
+ )",童谣杀人案
+格林童话全集,https://book.douban.com/subject/1043008/,[德国]格林兄弟 , 魏以新 , 人民文学出版社 , 1994-11 , 21.45元,9.1,"(
+ 95633人评价
+ )",一本有教育意义的书
+中国少年儿童百科全书(全四册),https://book.douban.com/subject/1028409/,林崇德 主编 ,, 浙江教育出版社 , 1991-4 , 168.00元,9.4,"(
+ 18700人评价
+ )",
+鼠疫,https://book.douban.com/subject/24257229/,[法] 阿尔贝·加缪 , 刘方 , 上海译文出版社 , 2013-8 , 34.00元,9.1,"(
+ 79496人评价
+ )",用别样的监禁生活再现某种监禁生活,与用不存在的事表现真事同等合理
+西游记(全二册),https://book.douban.com/subject/1029553/,吴承恩 , 黄肃秋 注释 , 人民文学出版社 , 2004-8 , 47.20元,9.1,"(
+ 89041人评价
+ )",神魔皆有人情,精魅亦通世故
+嫌疑人X的献身,https://book.douban.com/subject/3211779/,[日] 东野圭吾 , 刘子倩 , 南海出版公司 , 2008-9 , 28.00,8.9,"(
+ 524507人评价
+ )",数学好是一种极致的浪漫
+黄金时代,https://book.douban.com/subject/1089243/,王小波 ,, 花城出版社 , 1999-3 , 19.00元,8.9,"(
+ 166901人评价
+ )",我想爱,想吃,还想在一瞬间变成天上半明半暗的云
+可能性的艺术,https://book.douban.com/subject/35819419/,刘瑜 ,, 广西师范大学出版社 , 2022-4 , 82.00元,9.2,"(
+ 40954人评价
+ )",
+傲慢与偏见,https://book.douban.com/subject/1083428/,[英] 奥斯丁 , 张玲 , 人民文学出版社 , 1993-7 , 13.00元,8.9,"(
+ 233395人评价
+ )",所有现代言情小说的母体
+史记(全十册),https://book.douban.com/subject/1077847/,司马迁 , (索隐)司马贞,(正义)张守节 , 中华书局 , 1982-11 , 125.00,9.6,"(
+ 24977人评价
+ )",史家之绝唱,无韵之离骚
+始于极限,https://book.douban.com/subject/35966120/,[日] 上野千鹤子 , 曹逸冰 , 新星出版社 , 2022-9-20 , 59,9.0,"(
+ 71582人评价
+ )",
+悲惨世界(上中下),https://book.douban.com/subject/1205054/,[法] 雨果 , 李丹 , 人民文学出版社 , 1992-6 , 66.00元,9.1,"(
+ 63925人评价
+ )",现实主义与浪漫主义的至高杰作
+台北人,https://book.douban.com/subject/5337248/,白先勇 ,, 广西师范大学出版社 , 2010-10 , 38.00元,9.0,"(
+ 73866人评价
+ )",白先勇短篇小说集
+永恒的终结,https://book.douban.com/subject/25829693/,[美] 艾萨克·阿西莫夫 , 崔正男 , 江苏凤凰文艺出版社 , 2014-9 , 32.00元,9.1,"(
+ 47492人评价
+ )",关于时间旅行的终极奥秘和恢宏构想
+诗经,https://book.douban.com/subject/1883245/,孔丘 编订 ,, 北京出版社 , 2006-7 , 19.90元,9.5,"(
+ 28840人评价
+ )",思无邪
+孽子,https://book.douban.com/subject/5337254/,白先勇 ,, 广西师范大学出版社 , 2010.10 , 46.00元,9.2,"(
+ 47750人评价
+ )",写给那一群, 在最深最深的黑夜里, 独自彷徨街头, 无所依归的孩子们
+刘擎西方现代思想讲义,https://book.douban.com/subject/35313227/,刘擎 ,, 新星出版社 , 2021-1 , 79.00元,9.2,"(
+ 36553人评价
+ )",
diff --git a/GUI.py b/GUI.py
new file mode 100644
index 0000000..797db5f
--- /dev/null
+++ b/GUI.py
@@ -0,0 +1,155 @@
+from pyecharts import options as opts
+from pyecharts.charts import Bar,Page,Line,Timeline
+from pyecharts.commons.utils import JsCode
+from pyecharts.globals import ThemeType
+from pyecharts.charts import Pie
+from pyecharts.faker import Faker
+from main import book_list_data
+from collections import Counter
+
+# Ten best-rated books. `star` is the text scraped from the page, so it is
+# converted to float: sorting and plotting the raw strings would compare and
+# draw them as text.
+book_list_data_sortBystart = sorted(book_list_data, key=lambda book: float(book.star), reverse=True)[:10]
+# Lowest of the ten first - presumably so the best-rated book ends up at the
+# top once the axes are swapped below.
+book_list_data_sortBystart.reverse()
+# Titles and ratings of those ten books.
+book_names = [book.name for book in book_list_data_sortBystart]
+book_stars = [float(book.star) for book in book_list_data_sortBystart]
+x_data = list(range(1, len(book_names) + 1))
+# Horizontal bar chart of the ratings.
+bar = (
+    Bar(init_opts=opts.InitOpts(theme="shine", width="850px", height='400px'))
+    .add_xaxis(book_names)
+    .add_yaxis("评分", book_stars, color="red")
+    .reversal_axis()  # swap the axes so the bars run horizontally
+    .set_global_opts(
+        title_opts=opts.TitleOpts(title="评分前10榜", pos_bottom="bottom", pos_left="center"),
+        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=0)),
+        yaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-45)),
+    )
+)
+
+
+# Count how many of the collected books each publisher accounts for.
+publisher_list = [book.publisher for book in book_list_data]
+publisher_counter = Counter(publisher_list)
+publisher_names = list(publisher_counter)
+publisher_counts = list(publisher_counter.values())
+# Slice colours handed to the pie chart.
+colors = [
+    "#5470c6", "#91cc75", "#fac858", "#ee6666", "#73c0de",
+    "#3ba272", "#fc8452", "#9a60b4", "#ea7ccc", "#bb60b4",
+    "#8B008B", "#FF1493", "#1E90FF", "#20B2AA", "#2E8B57",
+    "#B22222", "#FF4500", "#4682B4", "#DAA520", "#32CD32",
+]
+
+# Pie chart with one [publisher, count] pair per slice.
+pie = (
+    Pie()
+    .add("", [[publisher, count] for publisher, count in publisher_counter.items()])
+    .set_colors(colors)
+    .set_global_opts(title_opts=opts.TitleOpts(title="出版商分布饼图", pos_left="center", pos_bottom="bottom"))
+    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
+)
+
+prices = [float(book.price) for book in book_list_data]
+print(prices)
+# Bucket boundaries: bucket i covers price_intervals[i] <= price < price_intervals[i + 1].
+price_intervals = [0, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, float('inf')]
+price_counts = [0] * (len(price_intervals) - 1)  # one counter per bucket
+prices_intervalsStr = ["0-50", "50-100", "100-150", "150-200", "200-250", "250-300", "300-350", "350-400", "400-450", "450-500", "500以上"]
+# Count the books in each bucket. The last upper bound is infinity, so prices
+# of 500 and above land in the final bucket without a special case (the
+# earlier special case indexed one past the end of price_counts).
+for price in prices:
+    for i in range(len(price_counts)):
+        if price_intervals[i] <= price < price_intervals[i + 1]:
+            price_counts[i] += 1
+            break
+# Line chart: number of books per price bucket.
+line = (
+    Line()
+    # All labels are used so that each of the counts has a matching x value.
+    .add_xaxis(list(prices_intervalsStr))
+    .add_yaxis("书籍数量", price_counts, symbol="circle", is_smooth=True,
+               markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem(type_="max")]))
+    .set_global_opts(
+        title_opts=opts.TitleOpts(title="价格区间与书籍数量折线图", pos_left="center", pos_bottom="bottom"),
+        xaxis_opts=opts.AxisOpts(name="价格区间"),
+        yaxis_opts=opts.AxisOpts(name="书籍数量"),
+        datazoom_opts=opts.DataZoomOpts(range_start=0, range_end=100),
+    )
+)
+
+# Line chart: number of books per publication year.
+# The year is taken from the first four characters of the trimmed date text,
+# which covers "1996-12", "2001" and "2010.10" alike; splitting on '-' alone
+# leaves "2010.10" intact and int() then raises.
+publish_dates = [int(book.pub_year.strip()[:4]) for book in book_list_data]
+print(publish_dates)
+# Number of books for each year that occurs.
+publish_year_counter = Counter(publish_dates)
+print(publish_year_counter)
+# Continuous span of years; years without any book get a count of 0 below.
+# With no books at all the span is empty instead of min()/max() raising.
+full_year_range = range(min(publish_dates), max(publish_dates) + 1) if publish_dates else range(0)
+print(full_year_range)
+pub_year_counts = [(year, publish_year_counter[year]) for year in full_year_range]
+
+# Split into axis labels and values.
+years = [str(year) for year, _ in pub_year_counts]
+counts = [count for _, count in pub_year_counts]
+print(pub_year_counts)
+
+line_year_count = (
+    Line()
+    .add_xaxis(years)
+    .add_yaxis(
+        series_name="出版书籍数量",
+        y_axis=counts,
+        markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem(type_="max", name="最大值"),
+                                                opts.MarkPointItem(type_="min", name="最小值")]),
+        markline_opts=opts.MarkLineOpts(data=[opts.MarkLineItem(type_="average", name="平均值")]),
+    )
+    .set_global_opts(
+        title_opts=opts.TitleOpts(title="年份与出版书籍数量变化"),
+        xaxis_opts=opts.AxisOpts(name="年份"),
+        yaxis_opts=opts.AxisOpts(name="书籍数量"),
+    )
+)
+
+
+
+# Put the four charts on one page, in display order, and write it out as HTML.
+page = Page()
+page.add(line_year_count, bar, pie, line)
+page.render("book_analysis.html")
+
+
diff --git a/bar_datazoom_slider.html b/bar_datazoom_slider.html
new file mode 100644
index 0000000..74e2632
--- /dev/null
+++ b/bar_datazoom_slider.html
@@ -0,0 +1,195 @@
+
+
+
+
+ Awesome-pyecharts
+
+
+
+
+
+
+
+
+
diff --git a/book_analysis.html b/book_analysis.html
new file mode 100644
index 0000000..5980ee3
--- /dev/null
+++ b/book_analysis.html
@@ -0,0 +1,1130 @@
+
+
+
+
+ Awesome-pyecharts
+
+
+
+
+
+
+
+
+
+
+
diff --git a/book_data.xlsx b/book_data.xlsx
new file mode 100644
index 0000000..1e34d1d
Binary files /dev/null and b/book_data.xlsx differ
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..7e5cefa
--- /dev/null
+++ b/main.py
@@ -0,0 +1,161 @@
+import copy
+import re
+
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+from time import sleep
+import sql
+
+
+class Book:
+    """One book entry together with the fields collected for it."""
+
+    def __init__(self, name, url, star, star_people, author, translater, publisher, pub_year, price, comment):
+        self.name, self.url = name, url
+        self.star, self.star_people = star, star_people
+        self.author, self.translater = author, translater
+        self.publisher, self.pub_year = publisher, pub_year
+        self.price, self.comment = price, comment
+
+    def to_dict(self):
+        """Return the fields as a dict keyed by the Chinese column headers."""
+        headers = ('书名', '豆瓣链接', '作者', '译者', '出版社',
+                   '出版日期', '价格', '评分', '评分人数', '一句话评价')
+        values = (self.name, self.url, self.author, self.translater, self.publisher,
+                  self.pub_year, self.price, self.star, self.star_people, self.comment)
+        return dict(zip(headers, values))
+
+    def __str__(self):
+        """Return a one-line summary of the book (the URL and translator are left out)."""
+        shown = (self.name, self.author, self.pub_year, self.publisher,
+                 self.price, self.star, self.star_people, self.comment)
+        return "Book Info: " + " - ".join(f"{value}" for value in shown)
+
+
+class DoubanBookTop250Crawler:
+    """Download book list pages and collect the books found on them.
+
+    ``book_list`` holds one dict per book (the result of ``Book.to_dict``) and
+    ``book_list_data`` holds the corresponding ``Book`` objects, in the same
+    order.
+    """
+
+    def __init__(self):
+        self.book_list = []
+        self.book_list_data = []
+
+    @staticmethod
+    def _split_info(info_text):
+        """Split a '/'-separated info line into its fields.
+
+        :param info_text: text such as 'author / translator / publisher / date / price'
+        :return: tuple ``(author, translater, publisher, pub_year, price)``
+            with ``None`` for absent fields, or ``None`` when the number of
+            parts is not one of the known layouts
+        """
+        info = info_text.split('/')
+        if len(info) == 5:  # all five fields present
+            return tuple(info)
+        if len(info) == 4:  # no translator
+            return info[0], None, info[1], info[2], info[3]
+        if len(info) == 6:  # two prices, kept joined by '/'
+            return info[0], info[1], info[2], info[3], info[4] + '/' + info[5]
+        if len(info) == 3:  # neither author nor translator
+            return None, None, info[0], info[1], info[2]
+        return None
+
+    def get_book_info(self, url, headers):
+        """Fetch one list page and append every parsable book on it.
+
+        :param url: address of the list page
+        :param headers: HTTP headers passed to ``requests.get``
+        """
+        res = requests.get(url, headers=headers)
+        soup = BeautifulSoup(res.text, 'html.parser')
+        for item in soup.select('.item'):
+            title_link = item.select('.pl2 a')[0]
+            name = title_link['title']  # title
+            book_url = title_link['href']  # link to the book page
+            star = item.select('.rating_nums')[0].text  # rating
+            pl_nodes = item.select('.pl')
+            star_people = pl_nodes[1].text  # number of raters, still unformatted
+            fields = self._split_info(pl_nodes[0].text)
+            if fields is None:
+                # Unknown layout: report the entry instead of dropping it silently.
+                print(f"Skipping {name!r}: unrecognised info line {pl_nodes[0].text!r}")
+                continue
+            author, translater, publisher, pub_year, price = fields
+            quote = item.select('.quote span')
+            comment = quote[0].text if quote else None  # one-line review, may be absent
+            book_obj = Book(name, book_url, star, star_people, author, translater,
+                            publisher, pub_year, price, comment)
+            self.book_list.append(book_obj.to_dict())
+            self.book_list_data.append(book_obj)
+
+    def save_to_csv(self, csv_name):
+        """Write ``book_list`` to ``csv_name`` as CSV without an index column."""
+        df = pd.DataFrame(self.book_list)
+        df.to_csv(csv_name, encoding='utf_8_sig', index=False)
+
+    def crawl_douban_top250(self, pages=3, csv_name="BookDouban250.csv"):
+        """Crawl the first ``pages`` list pages (25 entries each) and save them.
+
+        :param pages: number of list pages to download; defaults to 3
+        :param csv_name: CSV file written once all pages have been fetched
+        """
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
+        for i in range(pages):
+            page_url = 'https://book.douban.com/top250?start={}'.format(str(i * 25))
+            print('开始爬取第{}页,地址是:{}'.format(str(i + 1), page_url))
+            self.get_book_info(page_url, headers)
+            sleep(1)  # pause between requests
+        self.save_to_csv(csv_name=csv_name)
+
+def _clean_price(raw_price):
+    """Reduce a scraped price fragment to a plain number string where possible.
+
+    The currency suffix '元' and surrounding blanks are removed. A number with
+    two decimals is preferred (as before); failing that, the first number of
+    any form is used, so values such as 'CNY 78' are handled too. Text
+    without any number is returned trimmed but otherwise unchanged.
+    """
+    text = raw_price.replace('元', '').strip()
+    match = re.search(r'\d+\.\d{2}', text) or re.search(r'\d+(?:\.\d+)?', text)
+    return match.group() if match else text
+
+
+# NOTE: everything below runs at import time; GUI.py relies on that when it
+# does `from main import book_list_data`.
+# Create the crawler, download the pages and save the CSV file.
+crawler = DoubanBookTop250Crawler()
+crawler.crawl_douban_top250()
+book_list = crawler.book_list
+book_list_data = crawler.book_list_data
+
+# Tidy the rater-count text of the dict rows: drop line breaks, blanks and parentheses.
+for book in book_list:
+    book['评分人数'] = (
+        book['评分人数'].replace('\n', '').strip()
+        .replace(' ', '').replace("(", "").replace(")", "").strip()
+    )
+# Column-oriented copy of the rows for the Excel export.
+excel_columns = ["书名", "豆瓣链接", "作者", "译者", "出版社", "出版日期", "价格", "评分", "评分人数", "一句话评价"]
+dataexcel = {column: [book[column] for book in book_list] for column in excel_columns}
+
+# Normalise the Book objects used by the charts and the database.
+book_list_data_two_price = []
+for book in book_list_data:
+    # Keep only the digits of the rater count.
+    book.star_people = ''.join(filter(str.isdigit, book.star_people))
+    if '/' in book.price:
+        # Two prices: keep the first on this book and add a copy of the book
+        # carrying the second. Both go through the same clean-up.
+        first_price, second_price = book.price.split('/')[:2]
+        copy_book = copy.deepcopy(book)
+        copy_book.price = _clean_price(second_price)
+        book_list_data_two_price.append(copy_book)
+        book.price = first_price
+    book.price = _clean_price(book.price)
+book_list_data = book_list_data + book_list_data_two_price
+
+
+print(crawler.book_list)
+# Write the column data to an Excel file.
+df = pd.DataFrame(dataexcel)
+file_name = 'book_data.xlsx'
+df.to_excel(file_name, index=False)
+print(f"书籍数据已写入到 {file_name}")
+
+# Store the books in MySQL.
+# NOTE(review): the database credentials are hard-coded in source control -
+# consider loading them from the environment or a local config file.
+db = sql.BookDatabase(host='localhost', user='root', password='123456', database='xiaosuo', table_name='books')
+db.initialize_table()
+db.insert_books(book_list_data)
+
+
+
+
+
+
+
+
diff --git a/publisher_pie_chart.html b/publisher_pie_chart.html
new file mode 100644
index 0000000..e8b60b6
--- /dev/null
+++ b/publisher_pie_chart.html
@@ -0,0 +1,226 @@
+
+
+
+
+ Awesome-pyecharts
+
+
+
+
+
+
+
+
diff --git a/run.py b/run.py
new file mode 100644
index 0000000..3c9950c
--- /dev/null
+++ b/run.py
@@ -0,0 +1,11 @@
+import requests
+import json
+
+
+# Console chat loop: send each line typed by the user to the chat endpoint and
+# print the reply found under data.reply in the JSON response.
+while True:
+    msg = input('我:')
+    # The text goes through `params` so that requests percent-encodes it;
+    # pasted into the URL directly, input containing '&', '#' or '+' would
+    # alter the query string. 'type=bbs' is the decoded form of the
+    # 'type%3Dbbs' value used before.
+    sess = requests.get(
+        'https://open.drea.cc/bbsapi/chat/get',
+        params={'keyWord': msg, 'userName': 'type=bbs'},
+        timeout=10,  # do not hang forever when the service does not answer
+    )
+    js = json.loads(sess.text)
+    print('微梦机器人:', js['data']['reply'])
diff --git a/sql.py b/sql.py
new file mode 100644
index 0000000..4e9228d
--- /dev/null
+++ b/sql.py
@@ -0,0 +1,69 @@
+import pymysql
+
+class BookDatabase:
+    """Store book records in a single MySQL table through PyMySQL.
+
+    Typical use: ``initialize_table()`` (connects and creates the table when
+    it is missing) followed by ``insert_books(...)``, which closes the
+    connection when it is done.
+    """
+
+    def __init__(self, host, user, password, database, table_name):
+        self.host = host
+        self.user = user
+        self.password = password
+        self.database = database
+        self.table_name = table_name
+
+    def _quoted_table(self):
+        """Return the table name as a backtick-quoted MySQL identifier."""
+        # Identifiers cannot be sent as query parameters, so the name is
+        # quoted here, doubling any backtick it contains.
+        return "`" + self.table_name.replace("`", "``") + "`"
+
+    def connect(self):
+        """Open the connection and keep it in ``self.connection``."""
+        self.connection = pymysql.connect(host=self.host, user=self.user, password=self.password,
+                                          database=self.database, cursorclass=pymysql.cursors.DictCursor)
+
+    def close(self):
+        """Close the current connection."""
+        self.connection.close()
+
+    def table_exists(self):
+        """Return True when ``SHOW TABLES LIKE`` finds the configured table name."""
+        with self.connection.cursor() as cursor:
+            # The name is passed as a parameter rather than formatted into the SQL text.
+            cursor.execute("SHOW TABLES LIKE %s", (self.table_name,))
+            return bool(cursor.fetchone())
+
+    def create_table(self):
+        """Create the configured table if it does not exist yet."""
+        # Uses self.table_name, so the table that gets created is the one
+        # table_exists() checks and insert_books() writes to.
+        create_table_query = f"""
+        CREATE TABLE IF NOT EXISTS {self._quoted_table()} (
+            id INT AUTO_INCREMENT PRIMARY KEY,
+            name VARCHAR(255) NOT NULL,
+            url VARCHAR(255),
+            star FLOAT,
+            star_people INT,
+            author VARCHAR(255),
+            translater VARCHAR(255),
+            publisher VARCHAR(255),
+            pub_year VARCHAR(20),
+            price FLOAT,
+            comment TEXT
+        )
+        """
+        with self.connection.cursor() as cursor:
+            cursor.execute(create_table_query)
+        self.connection.commit()
+
+    def insert_books(self, booklist):
+        """Insert all books in one transaction, then close the connection.
+
+        :param booklist: iterable of objects exposing ``name``, ``url``,
+            ``star``, ``star_people``, ``author``, ``translater``,
+            ``publisher``, ``pub_year``, ``price`` and ``comment``
+        :raises ValueError: when ``star``/``price`` cannot be converted to
+            float or ``star_people`` cannot be converted to int
+        """
+        insert_query = f"""
+        INSERT INTO {self._quoted_table()} (name, url, star, star_people, author, translater, publisher, pub_year, price, comment)
+        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+        """
+        try:
+            # Conversions happen before anything is sent, so a bad value
+            # aborts the call without a partial insert.
+            rows = [
+                (book.name, book.url, float(book.star), int(book.star_people),
+                 book.author, book.translater, book.publisher,
+                 book.pub_year, float(book.price), book.comment)
+                for book in booklist
+            ]
+            with self.connection.cursor() as cursor:
+                cursor.executemany(insert_query, rows)
+            self.connection.commit()
+        finally:
+            # The connection is closed on failure as well as on success.
+            self.close()
+
+    def initialize_table(self):
+        """Connect and make sure the table exists; the connection stays open."""
+        self.connect()
+        if not self.table_exists():
+            self.create_table()
+
+
+# # 使用BookDatabase类
+# db = BookDatabase(host='localhost', user='root', password='123456', database='xiaosuo', table_name='books')
+# db.initialize_table()