From 4bdec520466a8a6cd7241b1cb761e1749eb7ea8b Mon Sep 17 00:00:00 2001 From: kun <13098278+w52020031129@user.noreply.gitee.com> Date: Mon, 27 May 2024 11:01:01 +0800 Subject: [PATCH] 5-27 --- .idea/.gitignore | 8 + .idea/inspectionProfiles/Project_Default.xml | 7 + .../inspectionProfiles/profiles_settings.xml | 6 + .idea/misc.xml | 7 + .idea/modules.xml | 8 + .idea/vcs.xml | 6 + .idea/豆瓣读书250爬虫_20220926.iml | 8 + 11_DouBan250Spider.py | 99 ++ BookDouban250.csv | 226 ++++ GUI.py | 155 +++ bar_datazoom_slider.html | 195 +++ book_analysis.html | 1130 +++++++++++++++++ book_data.xlsx | Bin 0 -> 14934 bytes main.py | 161 +++ publisher_pie_chart.html | 226 ++++ run.py | 11 + sql.py | 69 + 17 files changed, 2322 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml create mode 100644 .idea/豆瓣读书250爬虫_20220926.iml create mode 100644 11_DouBan250Spider.py create mode 100644 BookDouban250.csv create mode 100644 GUI.py create mode 100644 bar_datazoom_slider.html create mode 100644 book_analysis.html create mode 100644 book_data.xlsx create mode 100644 main.py create mode 100644 publisher_pie_chart.html create mode 100644 run.py create mode 100644 sql.py diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..35410ca --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# 默认忽略的文件 +/shelf/ +/workspace.xml +# 基于编辑器的 HTTP 客户端请求 +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..7c4836f --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,7 @@ + + + + \ No newline at end of file diff --git 
a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..a44514f --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..31c3df9 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/豆瓣读书250爬虫_20220926.iml b/.idea/豆瓣读书250爬虫_20220926.iml new file mode 100644 index 0000000..36928bc --- /dev/null +++ b/.idea/豆瓣读书250爬虫_20220926.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/11_DouBan250Spider.py b/11_DouBan250Spider.py new file mode 100644 index 0000000..545babe --- /dev/null +++ b/11_DouBan250Spider.py @@ -0,0 +1,99 @@ + +import requests # 发送请求 +from bs4 import BeautifulSoup # 解析网页 +import pandas as pd # 存取csv +from time import sleep # 等待时间 + +book_name = [] # 书名 +book_url = [] # 书籍链接 +book_star = [] # 书籍评分 +book_star_people = [] # 评分人数 +book_author = [] # 书籍作者 +book_translater = [] # 书籍译者 +book_publisher = [] # 出版社 +book_pub_year = [] # 出版日期 +book_price = [] # 书籍价格 +book_comment = [] # 一句话评价 + + +def get_book_info(url, headers): + res = requests.get(url, headers=headers) + soup = BeautifulSoup(res.text, 'html.parser') + for book in soup.select('.item'): + name = book.select('.pl2 a')[0]['title'] # 书名 + book_name.append(name) + bkurl = book.select('.pl2 a')[0]['href'] # 书籍链接 + book_url.append(bkurl) + star = book.select('.rating_nums')[0].text # 书籍评分 + book_star.append(star) + star_people = 
book.select('.pl')[1].text # 评分人数 + star_people = star_people.strip().replace(' ', '').replace('人评价', '').replace('(\n', '').replace('\n)', + '') # 数据清洗 + book_star_people.append(star_people) + + # 没有一句话评价,比如倒数第二名,君主论 + if book.select('.quote span'): + book_comment.append(book.select('.quote span')[0].text) + else: + book_comment.append(None) + + info = book.select('.pl')[0].text.split('/') + if len(info) == 5: # 正常情况 + book_author.append(info[0]) + book_translater.append(info[1]) + book_publisher.append(info[2]) + book_pub_year.append(info[3]) + book_price.append(str(info[4])) + elif len(info) == 4: # 没有译者,比如:第一名,红楼梦 + book_author.append(info[0]) + book_translater.append(None) + book_publisher.append(info[1]) + book_pub_year.append(info[2]) + book_price.append(str(info[3])) + elif len(info) == 6: # 有2个价格,比如:第一页,福尔摩斯探案全集(上中下) + book_author.append(info[0]) + book_translater.append(info[1]) + book_publisher.append(info[2]) + book_pub_year.append(info[3]) + book_price.append(str(info[4]) + '/' + str(info[5])) + elif len(info) == 3: # 没有作者,且没有译者,比如:第5页,十万个为什么 + book_author.append(None) + book_translater.append(None) + book_publisher.append(info[0]) + book_pub_year.append(info[1]) + book_price.append(str(info[2])) + else: + pass + + +def save_to_csv(csv_name): + """ + 数据保存到csv + :return: None + """ + df = pd.DataFrame() # 初始化一个DataFrame对象 + df['书名'] = book_name + df['豆瓣链接'] = book_url + df['作者'] = book_author + df['译者'] = book_translater + df['出版社'] = book_publisher + df['出版日期'] = book_pub_year + df['价格'] = book_price + df['评分'] = book_star + df['评分人数'] = book_star_people + df['一句话评价'] = book_comment + df.to_csv(csv_name, encoding='utf_8_sig') # 将数据保存到csv文件 + + +if __name__ == "__main__": + # 定义一个请求头 + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'} + # 开始爬取豆瓣数据 + for i in range(1): # 爬取共10页,每页25条数据 + page_url = 'https://book.douban.com/top250?start={}'.format(str(i * 25)) + 
print('开始爬取第{}页,地址是:{}'.format(str(i + 1), page_url)) + get_book_info(page_url, headers) + sleep(1) # 等待1秒 + # 保存到csv文件 + save_to_csv(csv_name="BookDouban250.csv") diff --git a/BookDouban250.csv b/BookDouban250.csv new file mode 100644 index 0000000..26f7d86 --- /dev/null +++ b/BookDouban250.csv @@ -0,0 +1,226 @@ +书名,豆瓣链接,作者,译者,出版社,出版日期,价格,评分,评分人数,一句话评价 +红楼梦,https://book.douban.com/subject/1007305/,[清] 曹雪芹 著 ,, 人民文学出版社 , 1996-12 , 59.70元,9.6,"( + 424383人评价 + )",都云作者痴,谁解其中味? +活着,https://book.douban.com/subject/4913064/,余华 ,, 作家出版社 , 2012-8 , 20.00元,9.4,"( + 834408人评价 + )",生的苦难与伟大 +1984,https://book.douban.com/subject/4820710/,[英] 乔治·奥威尔 , 刘绍铭 , 北京十月文艺出版社 , 2010-4-1 , 28.00,9.4,"( + 284757人评价 + )",栗树荫下,我出卖你,你出卖我 +哈利·波特,https://book.douban.com/subject/24531956/,J.K.罗琳 (J.K.Rowling) , 苏农 , 人民文学出版社 , 2008-12-1 , 498.00元,9.7,"( + 107448人评价 + )",从9¾站台开始的旅程 +三体全集,https://book.douban.com/subject/6518605/,刘慈欣 ,, 重庆出版社 , 2012-1 , 168.00元,9.5,"( + 187889人评价 + )",地球往事三部曲 +百年孤独,https://book.douban.com/subject/6082808/,[哥伦比亚] 加西亚·马尔克斯 , 范晔 , 南海出版公司 , 2011-6 , 39.50元,9.3,"( + 426035人评价 + )",魔幻现实主义文学代表作 +飘,https://book.douban.com/subject/1068920/,[美国] 玛格丽特·米切尔 , 李美华 , 译林出版社 , 2000-9 , 40.00元,9.3,"( + 212637人评价 + )",革命时期的爱情,随风而逝 +动物农场,https://book.douban.com/subject/2035179/,[英] 乔治·奥威尔 , 荣如德 , 上海译文出版社 , 2007-3 , 10.00元,9.3,"( + 160188人评价 + )",太阳底下并无新事 +房思琪的初恋乐园,https://book.douban.com/subject/27614904/,林奕含 ,, 北京联合出版公司 , 2018-2 , 45.00元,9.2,"( + 380002人评价 + )",向死而生的文学绝唱 +三国演义(全二册),https://book.douban.com/subject/1019568/,[明] 罗贯中 ,, 人民文学出版社 , 1998-05 , 39.50元,9.3,"( + 168645人评价 + )",是非成败转头空 +福尔摩斯探案全集(上中下),https://book.douban.com/subject/1040211/,[英] 阿·柯南道尔 , 丁钟华 等 , 群众出版社 , 1981-8 , 53.00元/68.00元,9.3,"( + 133712人评价 + )",名侦探的代名词 +白夜行,https://book.douban.com/subject/10554308/,[日] 东野圭吾 , 刘姿君 , 南海出版公司 , 2013-1-1 , 39.50元,9.2,"( + 475686人评价 + )",一宗离奇命案牵出跨度近20年步步惊心的故事 +小王子,https://book.douban.com/subject/1084336/,[法] 圣埃克苏佩里 , 马振骋 , 人民文学出版社 , 2003-8 , 22.00元,9.1,"( + 756658人评价 + 
)",献给长成了大人的孩子们 +安徒生童话故事集,https://book.douban.com/subject/1046209/,(丹麦)安徒生 , 叶君健 , 人民文学出版社 , 1997-08 , 25.00元,9.3,"( + 132841人评价 + )",为了争取未来的一代 +天龙八部,https://book.douban.com/subject/1255625/,金庸 ,, 生活·读书·新知三联书店 , 1994-5 , 96.00元,9.2,"( + 133043人评价 + )",有情皆孽,无人不冤 +呐喊,https://book.douban.com/subject/1449351/,鲁迅 ,, 人民文学出版社 , 1973-3 , 0.36元,9.2,"( + 160624人评价 + )",新文学的第一声呐喊 +撒哈拉的故事,https://book.douban.com/subject/1060068/,三毛 ,, 哈尔滨出版社 , 2003-8 , 15.80元,9.2,"( + 175143人评价 + )",游荡的自由灵魂 +悉达多,https://book.douban.com/subject/26980487/,[德] 赫尔曼·黑塞 , 姜乙 , 天津人民出版社 , 2017-1 , 32.00元,9.3,"( + 105727人评价 + )", +邓小平时代,https://book.douban.com/subject/20424526/,【美】傅高义 (Ezra.F.Vogel) , 冯克利 , 生活·读书·新知三联书店 , 2013-1-18 , 88.00元,9.3,"( + 70891人评价 + )",个人命运背后的历史变局 +杀死一只知更鸟,https://book.douban.com/subject/6781808/,[美] 哈珀·李 , 高红梅 , 译林出版社 , 2012-9 , 32.00元,9.2,"( + 146983人评价 + )",有一种东西不能遵循从众原则,那就是——人的良心 +失踪的孩子,https://book.douban.com/subject/30172069/,[意] 埃莱娜·费兰特 , 陈英 , 人民文学出版社 , 2018-7 , 62.00元,9.2,"( + 81644人评价 + )",我的整个生命,只是一场为了提升社会地位的低俗斗争。 +明朝那些事儿(1-9),https://book.douban.com/subject/3674537/,当年明月 ,, 中国海关出版社 , 2009-4 , 358.20元,9.2,"( + 175139人评价 + )",不拘一格的历史书写 +新名字的故事,https://book.douban.com/subject/26986954/,[意] 埃莱娜·费兰特 , 陈英 , 人民文学出版社 , 2017-4 , 59.00元,9.2,"( + 92225人评价 + )",探索青年时代的激情、困惑、挣扎、背叛和失去 +野草,https://book.douban.com/subject/1915958/,鲁迅 ,, 人民文学出版社 , 1973-3 , 0.20元,9.5,"( + 47503人评价 + )",我以这一丛野草,在明与暗,生与死,过去与未来之际,献于友与仇,人与兽,爱者与不爱者之前作证。 +沉默的大多数,https://book.douban.com/subject/1054685/,王小波 ,, 中国青年出版社 , 1997-10 , 27.00元,9.1,"( + 151597人评价 + )",沉默是沉默者的通行证 +中国历代政治得失,https://book.douban.com/subject/1003479/,钱穆 ,, 生活·读书·新知三联书店 , 2001 , 12.00元,9.2,"( + 72960人评价 + )",一部简明的“中国政治制度史” +局外人,https://book.douban.com/subject/4908885/,[法] 阿尔贝·加缪 , 柳鸣九 , 上海译文出版社 , 2010-8 , 22.00元,9.1,"( + 240586人评价 + )",人生在世,永远也不该演戏作假 +乡土中国,https://book.douban.com/subject/1795079/,费孝通 ,, 上海人民出版社 , 2006-04-01 , 38.00,9.2,"( + 90709人评价 + )",中国乡土社会传统文化和社会结构理论研究代表作 +白鹿原,https://book.douban.com/subject/10564071/,陈忠实 ,, 
人民文学出版社 , 2012-9 , 39.00元,9.2,"( + 108128人评价 + )",一轴关于我们民族灵魂的现实主义画卷 +卡拉马佐夫兄弟,https://book.douban.com/subject/25887924/,[俄] 费奥多尔·陀思妥耶夫斯基 , 荣如德 , 上海译文出版社 , 2015-2-1 , CNY 78.00,9.6,"( + 34343人评价 + )", +人类简史,https://book.douban.com/subject/25985021/,[以色列] 尤瓦尔·赫拉利 , 林俊宏 , 中信出版社 , 2014-11 , 68.00元,9.1,"( + 199868人评价 + )",跟着人类一同走过十万年 +围城,https://book.douban.com/subject/1008145/,钱锺书 ,, 人民文学出版社 , 1991-2 , 19.00,9.0,"( + 447368人评价 + )",幽默的语言和对生活深刻的观察 +彷徨,https://book.douban.com/subject/1449348/,鲁迅 ,, 人民文学出版社 , 1973-3 , 0.37元,9.3,"( + 65707人评价 + )",路漫漫其修远兮,吾将上下而求索 +平凡的世界(全三部),https://book.douban.com/subject/1200840/,路遥 ,, 人民文学出版社 , 2005-1 , 64.00元,9.0,"( + 324539人评价 + )",中国当代城乡生活全景 +罗杰疑案,https://book.douban.com/subject/21371175/,[英] 阿加莎·克里斯蒂 , 常禾 , 新星出版社 , 2013-3 , 28.00元,9.2,"( + 74837人评价 + )", +许三观卖血记,https://book.douban.com/subject/4760224/,余华 ,, 作家出版社 , 2012-9 , 24.00元,9.2,"( + 164226人评价 + )", +我与地坛,https://book.douban.com/subject/1209899/,史铁生 ,, 春风文艺出版社 , 2002-5 , 25.00元,9.2,"( + 109004人评价 + )",这是你的罪孽与福祉 +笑傲江湖(全四册),https://book.douban.com/subject/1002299/,金庸 ,, 生活·读书·新知三联书店 , 1994-5 , 76.80元,9.1,"( + 111909人评价 + )",欲练此功,必先自宫 +献给阿尔吉侬的花束,https://book.douban.com/subject/26362836/,[美] 丹尼尔·凯斯 , 陈澄和 , 广西师范大学出版社 , 2015-4 , 36.00元,9.1,"( + 109249人评价 + )",当声称能改造智能的科学实验选中心智障碍主角 +东方快车谋杀案,https://book.douban.com/subject/1827374/,[英] 阿加莎·克里斯蒂 , 陈尧光 , 人民文学出版社 , 2006-5 , 18.00元,9.1,"( + 139814人评价 + )",谋杀诡计惊人,波洛的抉择耐人寻味 +肖申克的救赎,https://book.douban.com/subject/1829226/,[美] 斯蒂芬·金 , 施寄青 , 人民文学出版社 , 2006-7 , 29.90元,9.1,"( + 121642人评价 + )",豆瓣电影Top1原著 +江城,https://book.douban.com/subject/7060185/,[美] 彼得·海斯勒 , 李雪顺 , 上海译文出版社 , 2012-1 , 36.00元,9.1,"( + 63040人评价 + )",外国人眼中的涪陵 +基督山伯爵,https://book.douban.com/subject/1085860/,[法国] 大仲马 , 周克希 , 上海译文出版社 , 1991-12-1 , 43.90元,9.1,"( + 129540人评价 + )",一个报恩复仇的故事,以法国波旁王朝和七月王朝为背景 +城南旧事,https://book.douban.com/subject/1254588/,林海音 文 ,, 中国青年出版社 , 2003-7 , 16.00元,9.1,"( + 157423人评价 + )",长亭外,古道边,芳草碧连天 +霍乱时期的爱情,https://book.douban.com/subject/10594787/,[哥伦比亚] 
加西亚·马尔克斯 , 杨玲 , 南海出版公司 , 2012-9-1 , 39.50元,9.0,"( + 286118人评价 + )",义无反顾地直达爱情的核心 +故事新编,https://book.douban.com/subject/2046909/,鲁迅 ,, 人民文学出版社 , 1973-12-01 , 0.31 元,9.4,"( + 43436人评价 + )",拾取古代传说,取一点因由,随意点染 +艺术的故事,https://book.douban.com/subject/3162991/,[英] 贡布里希 (Sir E.H.Gombrich) , 范景中 , 广西美术出版社 , 2008-04 , 280.00,9.6,"( + 26388人评价 + )",从最早的洞窟绘画到当今的实验艺术 +万历十五年,https://book.douban.com/subject/1041482/,[美] 黄仁宇 ,, 生活·读书·新知三联书店 , 1997-5 , 18.00元,9.0,"( + 202254人评价 + )",见微知著,历史观的颠覆 +朝花夕拾,https://book.douban.com/subject/1449352/,鲁迅 ,, 人民文学出版社 , 1973-4 , 0.25元,9.1,"( + 155213人评价 + )",在纷扰中寻出一点闲静 +月亮和六便士,https://book.douban.com/subject/1858513/,[英] 毛姆 , 傅惟慈 , 上海译文出版社 , 2006-8 , 15.00元,9.0,"( + 209821人评价 + )",有多少人会经历顿悟,就有更少的人甘愿自我放逐 +厌女,https://book.douban.com/subject/25836270/,上野千鹤子 , 王兰 , 上海三联书店 , 2015-1 , 28.00,9.1,"( + 82066人评价 + )", +秋园,https://book.douban.com/subject/34998019/,杨本芬 ,, 北京联合出版公司 , 2020-6 , 38.00元,9.0,"( + 108614人评价 + )", +射雕英雄传,https://book.douban.com/subject/1044547/,金庸 ,, 生活·读书·新知三联书店 , 1999-04 , 47.00元,9.1,"( + 85813人评价 + )",侠之大者,为国为民 +置身事内,https://book.douban.com/subject/35546622/,兰小欢 ,, 上海人民出版社 , 2021-8 , 65.00元,9.1,"( + 80263人评价 + )", +追风筝的人,https://book.douban.com/subject/1770782/,[美] 卡勒德·胡赛尼 , 李继宏 , 上海人民出版社 , 2006-5 , 29.00元,8.9,"( + 795265人评价 + )",为你,千千万万遍 +树上的男爵,https://book.douban.com/subject/6789605/,[意大利]伊塔洛·卡尔维诺 , 吴正仪 , 译林出版社 , 2012-4-1 , 30.00元,9.1,"( + 60492人评价 + )",是不是真的只有先与人疏离,才能最终与他们在一起? 
+寻路中国,https://book.douban.com/subject/5414391/,[美] 彼得·海斯勒 , 李雪顺 , 上海译文出版社 , 2011-1 , 33.00元,9.0,"( + 56992人评价 + )",《纽约客》驻北京记者驾车漫游中国大陆的经历 +刀锋,https://book.douban.com/subject/2035162/,[英]毛姆 , 周煦良 , 上海译文出版社 , 2007-3 , 18.00元,9.0,"( + 92048人评价 + )",一把刀的锋刃不容易越过;因此智者说得救之道是困难的 +无人生还,https://book.douban.com/subject/3006581/,[英] 阿加莎・克里斯蒂 , 祁阿红 , 人民文学出版社 , 2008-3 , 19.00,9.0,"( + 150494人评价 + )",童谣杀人案 +格林童话全集,https://book.douban.com/subject/1043008/,[德国]格林兄弟 , 魏以新 , 人民文学出版社 , 1994-11 , 21.45元,9.1,"( + 95633人评价 + )",一本有教育意义的书 +中国少年儿童百科全书(全四册),https://book.douban.com/subject/1028409/,林崇德 主编 ,, 浙江教育出版社 , 1991-4 , 168.00元,9.4,"( + 18700人评价 + )", +鼠疫,https://book.douban.com/subject/24257229/,[法] 阿尔贝·加缪 , 刘方 , 上海译文出版社 , 2013-8 , 34.00元,9.1,"( + 79496人评价 + )",用别样的监禁生活再现某种监禁生活,与用不存在的事表现真事同等合理 +西游记(全二册),https://book.douban.com/subject/1029553/,吴承恩 , 黄肃秋 注释 , 人民文学出版社 , 2004-8 , 47.20元,9.1,"( + 89041人评价 + )",神魔皆有人情,精魅亦通世故 +嫌疑人X的献身,https://book.douban.com/subject/3211779/,[日] 东野圭吾 , 刘子倩 , 南海出版公司 , 2008-9 , 28.00,8.9,"( + 524507人评价 + )",数学好是一种极致的浪漫 +黄金时代,https://book.douban.com/subject/1089243/,王小波 ,, 花城出版社 , 1999-3 , 19.00元,8.9,"( + 166901人评价 + )",我想爱,想吃,还想在一瞬间变成天上半明半暗的云 +可能性的艺术,https://book.douban.com/subject/35819419/,刘瑜 ,, 广西师范大学出版社 , 2022-4 , 82.00元,9.2,"( + 40954人评价 + )", +傲慢与偏见,https://book.douban.com/subject/1083428/,[英] 奥斯丁 , 张玲 , 人民文学出版社 , 1993-7 , 13.00元,8.9,"( + 233395人评价 + )",所有现代言情小说的母体 +史记(全十册),https://book.douban.com/subject/1077847/,司马迁 , (索隐)司马贞,(正义)张守节 , 中华书局 , 1982-11 , 125.00,9.6,"( + 24977人评价 + )",史家之绝唱,无韵之离骚 +始于极限,https://book.douban.com/subject/35966120/,[日] 上野千鹤子 , 曹逸冰 , 新星出版社 , 2022-9-20 , 59,9.0,"( + 71582人评价 + )", +悲惨世界(上中下),https://book.douban.com/subject/1205054/,[法] 雨果 , 李丹 , 人民文学出版社 , 1992-6 , 66.00元,9.1,"( + 63925人评价 + )",现实主义与浪漫主义的至高杰作 +台北人,https://book.douban.com/subject/5337248/,白先勇 ,, 广西师范大学出版社 , 2010-10 , 38.00元,9.0,"( + 73866人评价 + )",白先勇短篇小说集 +永恒的终结,https://book.douban.com/subject/25829693/,[美] 艾萨克·阿西莫夫 , 崔正男 , 江苏凤凰文艺出版社 , 2014-9 , 32.00元,9.1,"( + 
47492人评价 + )",关于时间旅行的终极奥秘和恢宏构想 +诗经,https://book.douban.com/subject/1883245/,孔丘 编订 ,, 北京出版社 , 2006-7 , 19.90元,9.5,"( + 28840人评价 + )",思无邪 +孽子,https://book.douban.com/subject/5337254/,白先勇 ,, 广西师范大学出版社 , 2010.10 , 46.00元,9.2,"( + 47750人评价 + )",写给那一群, 在最深最深的黑夜里, 独自彷徨街头, 无所依归的孩子们 +刘擎西方现代思想讲义,https://book.douban.com/subject/35313227/,刘擎 ,, 新星出版社 , 2021-1 , 79.00元,9.2,"( + 36553人评价 + )", diff --git a/GUI.py b/GUI.py new file mode 100644 index 0000000..797db5f --- /dev/null +++ b/GUI.py @@ -0,0 +1,155 @@ +from pyecharts import options as opts +from pyecharts.charts import Bar,Page,Line,Timeline +from pyecharts.commons.utils import JsCode +from pyecharts.globals import ThemeType +from pyecharts.charts import Pie +from pyecharts.faker import Faker +from main import book_list_data +from collections import Counter + +book_list_data_sortBystart = sorted(book_list_data, key=lambda x: x.star,reverse=True) +book_list_data_sortBystart=book_list_data_sortBystart[:10] +book_list_data_sortBystart=book_list_data_sortBystart[::-1] +# 从book_list_data中提取书籍名称和评分数据 +book_names = [book.name for book in book_list_data_sortBystart] +book_stars = [book.star for book in book_list_data_sortBystart] +x_data = list(range(1, len(book_names) + 1)) +# 创建柱状图 +bar = ( +Bar(init_opts=opts.InitOpts(theme="shine",width="850px",height='400px')) + .add_xaxis(book_names) + .add_yaxis("评分", book_stars,color="red") + .reversal_axis() # 实现旋转 + .set_global_opts( + title_opts=opts.TitleOpts(title="评分前10榜", pos_bottom="bottom", pos_left="center"), + xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=0)), + yaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-45)) + ) +# .render("bar_datazoom_slider.html") +) + + +publisher_list = [book.publisher for book in book_list_data] +# 使用Counter来计算每个出版商的数量 +publisher_counter = Counter(publisher_list) +publisher_names = list(publisher_counter.keys()) +publisher_counts = list(publisher_counter.values()) +colors = [ + "#5470c6", "#91cc75", "#fac858", 
"#ee6666", "#73c0de", + "#3ba272", "#fc8452", "#9a60b4", "#ea7ccc", "#bb60b4", + "#8B008B", "#FF1493", "#1E90FF", "#20B2AA", "#2E8B57", + "#B22222", "#FF4500", "#4682B4", "#DAA520", "#32CD32" +] + +# 使用Pyecharts创建饼图 +pie = ( + Pie() + .add("", [list(z) for z in zip(publisher_names, publisher_counts)]) + .set_colors(colors) # 设置颜色 + .set_global_opts(title_opts=opts.TitleOpts(title="出版商分布饼图", pos_left="center",pos_bottom="bottom")) + .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}")) +) + +prices = [float(book.price) for book in book_list_data ] +print(prices) +# 将价格分组到不同的区间 +price_intervals = [0, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, float('inf')] # 设置价格区间 +price_counts = [0] * (len(price_intervals) - 1) # 初始化每个区间的计数为0 +prices_intervalsStr=["0-50","50-100","100-150","150-200","200-250","250-300","300-350","350-400","400-450","450-500","500以上"] +# 统计每个价格区间的数量 +for price in prices: + for i in range(len(price_intervals) - 1): + if price>=500: + price_counts[len(price_intervals) - 1] += 1 + break + if price_intervals[i] <= price and price < price_intervals[i + 1]: + price_counts[i] += 1 + break +# print(price_counts) +# 生成价格区间与书籍数量折线图 +line = ( + Line() + .add_xaxis([str(interval) for interval in prices_intervalsStr[:-1]]) # X轴标签为价格区间 + .add_yaxis("书籍数量", price_counts, symbol="circle", is_smooth=True, markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem(type_="max")])) + .set_global_opts(title_opts=opts.TitleOpts(title="价格区间与书籍数量折线图", pos_left="center",pos_bottom="bottom"), + xaxis_opts=opts.AxisOpts(name="价格区间"), + yaxis_opts=opts.AxisOpts(name="书籍数量"), + datazoom_opts=opts.DataZoomOpts(range_start=0, range_end=100),) +) + +# 生成时间与书籍数量折线图# 假设您的书籍数据保存在book_list_data中 +# 提取每本书的出版年份 +publish_dates = [int(book.pub_year.split('-')[0]) for book in book_list_data] +print(publish_dates) +# 使用Counter来计算每年出版的书的数量 +publish_year_counter = Counter(publish_dates) +print(publish_year_counter) +# 确保结果包含连续的年份范围,并且将缺失的年份对应的数量设为 0 +full_year_range = 
range(min(publish_dates), max(publish_dates) + 1) +print(full_year_range) +pub_year_counts = [(year, publish_year_counter[year]) for year in full_year_range] + +# 提取年份和对应的书籍数量 +years = [str(year) for year, count in pub_year_counts] +counts = [count for year, count in pub_year_counts] +print(pub_year_counts) +# +# # 创建动态的时间曲线图 +# line_year_count = ( +# Line() +# .add_xaxis(xaxis_data=years) +# .add_yaxis( +# series_name="出版数量", +# y_axis=counts, +# ) +# .set_global_opts( +# title_opts=opts.TitleOpts(title="每年出版书籍数量变化"), +# xaxis_opts=opts.AxisOpts(name="年份"), +# yaxis_opts=opts.AxisOpts(name="书籍数量"), +# datazoom_opts=opts.DataZoomOpts(type_="inside"), +# tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="cross"), +# ) +# ) + +# timeline = Timeline() +# for i, year in enumerate(years): +line_year_count = ( +Line() + .add_xaxis(years) + .add_yaxis( + series_name="出版书籍数量", + y_axis=counts, + markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem(type_="max", name="最大值"), + opts.MarkPointItem(type_="min", name="最小值")]), + markline_opts=opts.MarkLineOpts(data=[opts.MarkLineItem(type_="average", name="平均值")]), + ) + .set_global_opts(title_opts=opts.TitleOpts(title="年份与出版书籍数量变化"), + xaxis_opts=opts.AxisOpts(name="年份"), + yaxis_opts=opts.AxisOpts(name="书籍数量"), + ) + + + ) +# timeline.add(line_year_count, time_point=str(year)) +# timeline.add_schema( +# play_interval=1000, # 播放的时间间隔 +# is_auto_play=False, # 是否自动播放 +# pos_left="center", # 时间轴组件的位置 +# pos_bottom="bottom", +# ) + + + +# 生成html文件(可选) +# pie.render("publisher_pie_chart.html") +# 创建一个页面 +page = Page() +# 将柱状图和饼图添加到页面中 +page.add(line_year_count) +page.add(bar) +page.add(pie) +page.add(line) +# 生成HTML文件 +page.render("book_analysis.html") + + diff --git a/bar_datazoom_slider.html b/bar_datazoom_slider.html new file mode 100644 index 0000000..74e2632 --- /dev/null +++ b/bar_datazoom_slider.html @@ -0,0 +1,195 @@ + + + + + Awesome-pyecharts + + + + + +
+ + + diff --git a/book_analysis.html b/book_analysis.html new file mode 100644 index 0000000..5980ee3 --- /dev/null +++ b/book_analysis.html @@ -0,0 +1,1130 @@ + + + + + Awesome-pyecharts + + + + + + + +
+
+ +
+ +
+ +
+ +
+ + + diff --git a/book_data.xlsx b/book_data.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..1e34d1db1c8fa69e9b3ac8b74d123ad15a2626a6 GIT binary patch literal 14934 zcmZ|01yCK`(k_e#ch}(Vy0IX^VdDgMcXtWy1b26LcXzjqI{|`2aLLbmPJRFRPVT*X zYG!Jx_S4g=tGicES3e3ekWiRlU|_Idy~WMy;+lPk0Uu9wA1CU^X>4bt;Am&>#B5-1 z&*W}nEf*t)(9eSU^^f$)rmgDLXV%Y)1v_J2lH6Re#-gsluinaiKi2uTSfbjo#?=SO z-8e%AOMKDBB*2;x#0Nwd%J3v8@h#Meg{D0sF;>k6DQXEuAoa0i*{3sJiO!mMj|e+C z_h?TlT7pwEol`7XX&zIoZexN$og~e6PX^FE3D3|9|D|t+!%N%6ho1BhU|?wfTVEqP zN0WaPo{H}S4zM7G=s_QD@lJ6eE2bcXVMt2KLO4XC+eMnE)391T+&3DW2JL|J48Pw# zH5@E|wGin)!X%?|Otll=zSBr=HIc}t0j~$-gW9su-xfDQ4>;v!oWGY&g$jO#own;E zx2TXBK&;#oF)nWsPY+5L;oYy&xVmO{fi^|b4<+^*pIt;^;gpXmsk7RJPc;Y{JGz8e z3sx`pGnDzX0sxq{6owd-hN_Oq@o`+;|9(_juF}qtdVnd>Tb1G!+o+#704;64UHH@X z4YluONQdc=ADA)%{Bm^s9(96!YHcsy>#oHVnyyZn7IFUDIBD9MG09 zakpl6HaD>``S+FeAM>1RYB{a(VESIx%(|Q1+aL^c`8PLWI9a%^H>L%h14)9>)T6CE zjN^j*dL+0|Cz6rQl`z)# z(Pc0fnpIKMwA1_Ccu2zjh&7=SZBhV^@lBYjEv)h8$$IE@ZA;K(=7g7s&EJ-VK>f4d zv|Y&oP%~^-rGK#3((H?x)yUkuXJiTOx-LZ}_Zj_m%d-_|{a!f=fR7PH1H_iNR0#pTh;@(T|>+ou`knefqC z@(WgOn+q}F!&S5HNZr@+p##<7!xgj6rxsMyZ`D?XC)nm9;`^tb=vYmMLJOb4)a_lD zC+8uPb!nen67g4wIyQy1uhcI9;b`x}Lr}{W^LWV|;*MFd(8yrigLAS9-ZjF_5j*|y z^V+e?Gqg<#_^|1t{&x}qq(d7uZ+9sjV}eO17EHC-_q!zZr>Ia;?tH%t3aPBYDT> zy}Hl6yo4<|sI0ev43sENA(k`B*nWV<(= zr==TqbUQlip~5a*;9H)$)LPIZsyO~$rqfTSIqR2CMdEY!s+}7JDe73@DY)Al<(v#d z%kD|N*HxMPL>TB(i3l*lAt@A8=w$$3IZ2Fn`l7Y>%mtyVL}rR->9`*j4df-xR8z<7 z_E$?Nw0K?akN-|b#-jBQhwwpNt^wB-q-L2@9?BtDO3VfieceVZ zcAaA~fo7~5&Zy{1^OWam#@THn{ z4aC7jWAx2&YIH=UCV_SZkkfniW-ge0eg3>k5mb||!AOoBE92iFOwsxpIetYq2zP9( zMF}^dvS7j;fu1cwV0)r`C!GX3tZAcZ<<~kmsmXH|WSDv4*E?#QLSPqdag_PhoY>t& z#UuT@Zqgn|Jb}n(#vf1A?3PQ!`Esx0dO%*pqMRbyiZ9X)Y15vy!ON4K8rKHDpcHF2 z@N_cdj=u+3g=+Ij{bqBtGB@{f)TdZmaLJ!cG!k}`PkNCTk~%XW8tUPtmui+YtiXY3 zV_iwXAl{5KtqX==Mq^i%VH4I4~o9`_fC3b9ol%lP-KNr8hUTo52 zTSM!HXG5k0=(kA4Fsm5fGO6oRC7Myyx~c896llZ(&b(_>jpiVt0kIL#C5;ksily^! 
z@szud=%of#d3wayo6%EnJNJK%H~^Jf(OsKSAlTYE0A-#gtY*bG+l+{C3|e6t=zDTH z=SxQ!nbn?TuFolMefl?V_G1z#hkQqRzjZ{beeXB-pKTNPBxN%_hS0#B#URL&2w|{s z*bWpf;$vYjz^4+&zFt8D7#9eFE%`qAMztx7BM@2*K-nDpMW!-BkKA1D>{7jMRVD zbr%I1aRkI~3(|Io8$z8jJX`z4D*vc26F;$S1MJs%w;*+YVlapn`u6!mM0`i9Fzjxx zk$x8_KUnI^P@y`OMxsecyyI$HvJ6TrR9uu)05~cASdILQ^UU-GV4Olpi&Ab|g&iN1N*pBt7mo&%3(=H8nL~eE&S(Z%&O=bfC0c?lOPb z+U)7@_^x-Ur{ni(@8@+Z`Zn9q0unE?f4lfO^+L^@4f^vk@D_RG_C=u6=l1;`P?dv?%&~Ub*3X>MJz_Ti@Lh-d)~qu1vZ+p4?wDI$Hj`{|pTU>G4a+ zGx@f@UH>d;LGkN+dCJgw7V&NCtFa}v)#dN$d3oCF_MXF;>0$m;Y6YSO|JLK7N4$_j ze6d?&r{5NeBmW1P*j#01r{tbj(98D?tVQMB^-VzP(NR9&PU9OKfB@l(&HQxYcGou# zifitd2V)nSc-bgdirtDguZY!Yiq4Im&`gmxwPBFg+<0+ZVNFrItil*5hYKZV#_oZ# zf4v-C#E~3sJM+q0$;XWDXt(7#OIGBmtgFIKAT)1q`%-GL&Aa~WD~^%Jo*?ZX>n^3I zWvww#?RJWsU9z0r<2n}*B$lsq=PEjB*?JDZI~;t5l~C(@UqW&F;^TJv{_CZ~4MgJi ze9;5;3c?37N$MfC@fJ4)Ao{&s4@^C@=<)k6TDlFUKC*A<%=dkHxzD#v&ZTb+ zV0^Ht3hdEN^earw(TNgr?9|+T&*y7X@7lr*7y)HYyy04%-2K>LYp<#?kx9{UD4UX; zw=;?N?v5zmuuvKx-zB(+KB{05fpu$RpQc(TS+)K{#zwWyX-R~pvnI&LkGESg-}Xw( z&+k4MRk04`J9GCOR3k_G+KJ2yI;~n}XIO=nN&XUIi0qEBg-H5`@?li~v8}QS-ZqR> z{O4oQKE{BZtsL(Dfx6*HxOr3Ldq9+nVi>}zQZ4zU34Xr_-LbTsL0{LP_n~<04wL*c zAc>+j^^AWO(0W^`e8=&7394@F=mrBh+E}5vrX)$~=hcKwbB9)QhY|J?AiDb^W#g_x ziysfG3E$R1oT`Fq=aeAYQUo0Plr_uxX|G+7>ham!LH+P*_I@s^?z3h)Ln#)KWX$gb zE?=7GBJIX-{29SeY&ufF3a%O*tV$jmTj`iBw5^a%yNm!-hJp%2ZM4v^4LOYQ#vxVP zaRHQZW}&4{dzc#4FHC|=G5ev;c>ql95oktFE0T8vzo|RM_e*=VuP+3!=ljAfcLn*H z)78Aq_9N@ibI&R8XEt{=l++h5Z(rd0rNXb1?(w>`)+s2|_qyVyXXnU2 z=$qdcsB-gYOX~KZ;b zk%_eeGb;rup=*@&`RT-|tdQc{*_ien(6Yjf^B%H!zjS6cJ9YY%`OgI{0{=E94NYt5C>bVd}z&jXKRFrOUy zgb#E$#4IN*Ivsf4t~PnuV;Jn~T^(}crRb0xdPfh)5qLG7Y2<_gSwDZ)hs5KcY`Q;I zvUH@J&`URMTGeVujd#g0kCup9G7<{6t+QmXJMGVVY!!{B#L|$Bi5Mu1BHF3VG%y$A zF&+DGaqONqHpFWh3}bLLWgf>2I=xji!nO$=4}V>V5jZwSL?k>kgH`W+>_nDQz$hc2 zo}{;*?qP=wFJ`DEyO)#EUM?YHUbZ&iWzx5o`MVV7{Q+)TC$$-bAij}^IId}qS0xUK zrmpN7bPRqYldU-v{pD`BZJ9W6mMF>(+i0Z1&cp~?MbFsE23f}OvhEUmx? 
zLxXkss?p9~R{2*hBJ=A+oQJ}4N519k%u1afNYk7|x2VNYlX_%3WSZ@kL*MBpjiW;= zLaTU!3RIj~f6PnsF z(=PjaypAW3HCb2_yh{wa%%y}7++V44|Z z_J)C}FG3Yok74?(xWiH^{ZV;?fyre+z>qzX7!W@HbYVD{Juzn;{;#iyPU$gBmr5w7nO$hJ>MMmCLEz>JUU3z4X1>GK+$1o zbYMZ6uYU(PY7op6CF8otxQtvh{LKOh+2u%CgpglfKs+lEW8Ox-7Pp@cga$EeKs#(8 zI{X{S&TD*G9D#L*{u`64UTr5P$*@6A%AS}I#m|StJ~1fVWJj5JQ^#WwsjX_v!#mrv z#7!x}+_uwQ9SKI|4F3992=#pijjGtjQkjU6#?x^_C(cz3igfv(DPVK}xh0|c>ela~ zVA&JYbK`+16G1>g8AjJPSnq8z;$L{mdoS*b8r}h5pF*t3v}}?LKIzTJlTLh9*UKh- zN!ZWp=QKcBm6a zQ*bFO)JfJFtbaaFh-2!`p%RlaMpIDy&*tv<0H={1O=&v~HkvK6 zXlO(ys#x;mw^Y-*MEU}T5~bky$!zpP0J~?H?S&_p+f}~#G$Pb0CD~U}QFcY1@0I%j zoQU>Ryp9%gAySWdI9H1q3QiSm3r4?#SoMjZ-E?5BzNur_MUF17lN%&Ync?vD;Um|A zkKkg*aW6A9mbaX%lqxE6?6uVhz|^L`LrBjt!nbV%%4ZtBM=j#-t_>lzs~`phb7XBE zqsdp{W%Mz$z;NU*kU`Cia>7Tqg{X+Va(#z~F(g{FZ7I$&j*bSxDgJm$ zRohzhHjc{D!g9QQI;8~}Hdf(|C;704B*aQ2G;|o}e`S*boaKd^9Yj^>B*z<7Ip#xz zta2e^2j*6*GN@M~2a;eyO=bLsVbt8SGQKMx__~8FmsMEc!341ZD4ucI`@P!Mm?_fT zkR5&zrcMlfMyG#OT)NNd86#FbOKv*IpPwIBRprzuJ1K*os~GQVAe$%t;L>6%PS;yT462$sw*=62G5-$y>Mg^Q*G4tC*MJ# zc7zd)?(UX) zHoMw=_?Q;rHibDy*#!H=ORJ&fL_d$2{Hd>wzk1iS)UVtN>YvwzkJ+?G+*&y;kk-KT zOx-Ct$Oi!xWS|pKzrbKQS!?GrN==YSR9={rLq^xQP-zx6mRF1NqeWOmV`8J zkr!o?sSNGy@dQsZ)oZ<=Cj7J{;mPA;IJE<*5rXws-uve9|F~u_DQzS-vfxJ}E5SXi zpE;680F}Fl89_8D%vX~cx4;^C)q@~m>T6L35Fv1f&9$~Ibp|kGU{cGCHCOsI3+2{? 
z3=kzoE0etB`Pzi}3~fr&0lJc)I7@PCp^W5FE`5lM_k9)*ea4^hEOXogJ|%0ZaHL?F z_>JB+KbaK8w)?*qVGZ(`tGk0i>LQvSff|>oX>&h?VR6g_T$PQCbv@GvM zT!ViuZs=uPYqqHp1dL4imsto1^@qDZ?I?$tqQUkK0VVwY5?d!geO zxw9+h?2~paYD>AIDjN^kCtj*cW*E1b@AEQ5TK8<+C#d9(HttaSRT@h;3*b#=u#p&Y ziYRxw?rvh0c#CPK7Ckykm4b|seybcYj5*zh{|#lrX|bA0P{t&%km501V+2|k;1F@% zis>=~(o7JzA?y4cE;`=kaOCQ%^pCRksif({urxs8wgukw(;@Dz)jD=N8(AZy0R(l@ z0Ik7`g!j+r2KD*RyJO9|@yNt)z0gG}-Xm-4Q+P@i(_!pcl18Rk+%y~;upZ0AH_L8@ z^_5r3WQwFO4oEQ6cQ{KuhS^ha&kyW3RzX&vFw8*3G>V*u-^?>Lc%q42-HY(ysJ#B- z`-K7d75(VR^i?sZs)ZKNX_&Rm0rRoBpW;3nhMgQ6-kkpVrjNv zb!Eo*)gM<|54ZN1T>xyff@R;US>oa9?+irjFf)D&(zl;M&WAz=@a{QrION)1@zt2p zN?zr0Woi4ipPHUJ4;9+0vd=zQg$&V)BGYUNlqm@*JJ>}c{G zga6`hbtz)_oFlz{#~^DsPQB=>yYFNwP8VOr$I|~}F={6h4I^Z)NNqn*dA#35odqza;*S>ft6Y7r!XGw6 z>{j{&X8$?vN{caov~k)LpZ2j_76J0ZGZthD+g;#rqewZ8@W+;45XY!33WzzUZ{ZJ* zH%pUaqXuTh|pn2h}VkEV=!ze|COCTi)MRFB7ukX~A>eB=TZFsGD zjmBinKXWyS{AOUKk!6@47qAo;OSve}255n35Db^?7z9*R?p_MW^yg zZej_yjDio3E&+B*x~Lol%JEbdOzpjWs#S4o25lWr#k4<{jJqyTABZV)KHV69B_80 z|7kR!XF;cZ$?{XjJq>ih@d<%M9vSaLV_rBNE&G+ ze6c^A8Du4|jw2%F%DghsZ3Gi{cBe^cV{G&OB+$N6M2c@L*GPyfW|v%=sF?8$Q(?v? 
zt+#?@6i6*y5jIk4a7eqz>o*P@CC6VbYmpH?PM?b^28p0p8E3VNbEp`vtvKCjOXlbSwn-eS7@H~o%hEWP?Eq#E{g?j zbz}Wk3bKPMlONoc_svz(Fb~FUie}@cCLL#pUBHtH%O~fja(U2H5CfNkm?1N4!wmwYGMw}gqNh8OUbpIc*qm2@rrra3&qV*u07RFKM)50HW zpG_6Bwww{;fj4)ldEW(s+gnZa&*r%!J=aSte)dH8QJ=OZfM8{WP$3F&9F zI;ezVv=a!*iQk1C;2JDEDLH!1?rRj(H|>g96;~CiqnB8_tIZxiZ`XnNaH>#g6Byz6 z_`dJ~i~G=`2=(NVJn(UPJvknb$+n8m1o@-8wnka~r|ktyP*$gj>oFpN$baR~F9?Oe z@^-udXmi_gc{pToHceUvN)-|W?vt}?R>R1v&rwxK-KYU_t;;wEnrwnY zK#CLX(f`|@W^-dEe8C*^AA~~V>l`eG1rh@WwuE`OH z>~9mZ4Lj(hbPI!k;vpn@YY_8-OCp`oYq&d>pD*umUpvL6byyat{dZi2&%UQOq$@AM zHmb5}CKoQ#vh2)Qiep;z_s-u=f<*Cq{;HLhn9^%513_;NABBWW{VE}()JWKgF(w~g z*}hT1`dpMX8<9ma8Ln9vMQcpBb*7A^I@oUGjtz^x2Pt!)G6< z!X+wJx?dN?7l8(9E{3=#>pbSd8kz6(i)gS^wEil51|lxiK9RR;Puk7$Ef4o3oQWHY zqNcUd;H@zM0`k`y*ZR!4JXUAmg1evre6{^~JcWT~niS@X;t7yesLVM4JDYImqGMbGj z!KB^aw$$NV=P<~uhV4h=>zVEeV3EI8qf**3-%g-o1V3G;(x&G=!k&XE5uk?7rVtyImh=F&pEfmT{zGNEtqa>wdXE7 zdFF8z0{)Yv6w-XpQ6j7+%T(&KhFLqAwOPPP*9DjX0u_U;y)kf{_N614h?sG=BU)|{ z9F=Xn47WxLav}l6?*N}j)r6v#8PpTAnPllJiqw0z%vojY??|q5Q5x4W<r%~M8bm94e z==<%J*fo1f&-y0NLZ`BjGEIv139gOO>W|S7%xay$>ixlNh52riDP@&m^ZjZRHGtyV zmi1AKQfrZO-d*tRV1~G8tJ{D=9HN21Lp0z1pd2oBQngkJc~0w6+3Xns55#a0bmc4t zjPMiYFrGc|7#OVU<-M(2nJMB<055|4%%~dv<=~7?Asdf9go*Dr zoVO@>IZf)>s)=mId`vrcDdVRpF=OFpoRf8m6X=VSaOM1pdp>)FQGaU*LIlNbF0{JJS_xss?nXitgPnbE&yK`zqYERDKrP zS~2#abj{c1ca!U{TC}PN1KdTt;lPPi7EGsc${udDMZ>flK@ZhdDs&8~JK98hL z9mKbfP2>B|1v0V+D1oX@yEM+Ctd;{PMRqmmW5Zgbw8uK(1$YzE+{`Z5iRuKejlzXV zf4o`>g1!j-eDR}r5BNhac6MAqUXIWKahSHj5&Lu{dbLjY1{irUzb=iOj)o7b7@GW* zgD9p0C}o}xt;eig#^SpcL8xT9!)|PIy?YM60B=o=Uev)?C!{`YiklM6Bd0#@A#>n4 zX?wfiW0t<+5m6glTEVqR0G|gHoBxIOd)!iAFOVn@xTb2Fln9p&iviyik?bqnbMwv5 z)5-7p=2u3=_Ey)+#rxDl%}AiMF}Ho)c#c|09$t;nISmzn=)iiNKl#3%0v0DBbH2zh zzEMO`iq!;ohHPkQxq8}nCc}2fb51J?EGo(#vE+kAM%1lDDfCzhdH%)JbOT>=nt9V| zDnUeP!3~zV+t!dzD375rj+(MO`IR-e10fQHbMW@5lKNYF`^RS^(b`gAKbVmO6^rMa zy}MKKSrjZ~N3dQtEM;fU8gATJ96aUuPe~85#TC!P>QA5XnJ40a?tWWP zkiWV*k1cy?SE#nSHffH~$^EmzliMNloMo68@}1qS1-g(&oKL#axJW=o2V)~xzseB| 
zh{NP^+neXD8OJ@Rz~dRfqykZHZWTq0SrSM$r(c4??Zw2A&k`NZLa@5!v1#}b_+VzA z8b1@ZYhJXXW{;@HCpbVjw2;l+*BhcIeU#3rlxWpD7U28S!6H(gKSXD2(Z$2-Kq(~# zUokFRxxQG0FH@T*)M;}|HPD-=iEy+cezp3@7G`HnvO(Gt zz@9ip)hXF=ktG8Xqpvb1#AMHiPAaXYl(#VhLP}EFrD)DY(>6X`{wgHTUv?L> zr2FGn8&DDv_A-xSrUP+5i-BV144`(_@zWWd@2-rcMJxvEcDY%ry+P+S=-e<|AlsbA zzA4bM7u{l;6O4D`*HME`CM1DN=R&a0H(+TEewg;TJ?r*i3KBi+Wlii|+xHxW#Py7p z-fq7Op`&W~BD)6j2l;09mF7V-@_P!UoHh4gpm-i!aF+nnyYAa+TG65uxo`aU8qwp( z-9r>ASgK+P4tncv4(^u_m0TeSiJbG|$&BREgx}@`QM&??5*cp7QPqFThmM3h)cp~$e|rTyql3dJk2t<;m06?VN(pQ%ZI z^MIvjev)D`Fh`3o4nj=#j2w(qh1Gglnn~WU_=137#hcRW)*qZxSTAa?Qu<@dE;5!L zKYijRStE~nvTjB=8Uc7N24c~b!m8c)?jZ!+w*lI#L>f<|x z-H|yPP}mkIL^Zb775Kd*G^&7N<4(|cSGbov53!g+NbL~(OP$M?<9PVr^I*l@=V}^W zsGxG9v34yukU?%v#GaRvf@nb~vL+T+3#Q6e^T{-?A`7HSV``?8d8L6=IbctYXD+7aU(5TEq^&}X5d#mDc7MwJRjWVAN9$-WD z?>%Q7$pS#eR?HSQcB99|Ra>ztE&T6(r=#Mp1;x*G6rC<}x#Cy3mP_fQZ5+oIAkIMy zQfItaDY5H`QeC<#^);(G^9%HE=V3XY`JKC^Q?FISUVT_ZtDGP3J9aB;H-(gUQw!^) zkAi%qagar$WIafbS2wj))b|Pr+i|oWUG~XE7+%oQ*!r93U!1(Z!cx{TUaU?H5+ePR zzu=gp{bhv=nh-cpW7GK*tGf%S56mGSSH>-MR6t^HESkou2J$vcoToSYUr1j z%mX7FNealE4c|XkOyc@1d0gXn(wYCo=Z9bLJHU5+yj(v%$vioQey}M3+#?GTD zsM$!&9=0VO9{5@Fu!9nj%YBY5=4~~}&rIAnV{C4QUo6v?Z6dCLK$4QUf|#CFhHlCp zk9>A6KB-|T9~asrq^3BsyBt~Jl2M4b`J_X$nz0ee!-FQPh(Yb#g4?PtHiN%c1|T-2 z5Wry|sB8)-PG!Pit?YK@25jI!n2#@4Q@-s_ma5EOL{Z!qu@?c)?s&y0%a6>qma#>$ zPSM?~BbDdK<}^u?<`$L(Q%)hKn}6)=ML#q*f=`hH3zMsKd}NBeoI307pQED9K9+TV zx35yF5WjhPKXpL_^l$);oYjXRajo9$@GJIMBakrf8nowJbfCB8DfW6zWAVv1rL{NuB3Ln>1#-R0dQ}ctR95mfxD`9| zCDF)I?Kh^o!h1G-UX8m6T9+!Jow=nIr}Fwz9O5MM^z%6J^*SWc?8J{v@Jpd}z8aAU zeL*uCVnuf0+^3y8N7HLFLOe?C4t0->J~C6vw5L8YW6C;E38Fa`5d(&?IR|;)iH2#l z&8^ID(~aaJV@_4tpOP39E(mIecUM#lw^`t=4axn5OLB0bU%b|+L5u6pKbC|S*2KSQ zE}+(#Eu;zwNSH1PP$zQ31xWK~G`iI2lR$&$oHV>yJO}rmW zP+h|+}@1T<=NGOUiL=ALO_$QJ=CBkqEIS&1~|R=;l(2jWvErfvRpW; zZs^W(Rb+FRabPPve}%CkTJ*_U`oSVHv?yZsHX^mjK5i9l{uTeZ3mQT~t@9H;q%FTf z7%DlX(A~J)wLRgkW|H;9;J2dq`$-eh-S95m(AXr|DmhsBs>xAGL}mrEG{M2|?|LtC z)I74l%6DH$F9mR&nXh-^=`hs;o;Lm7H$v0MCr8x*t(3M1zYP+ 
zY5nJ@a~@_HTSlGs4)dBSp_-zMf%Rh+R>?pknBQvb*`2gri-8uAL(?a47{Hl%*ud~= zhAKnX0{Wwth;UX`Xzfe|T{wXAZ0ViBu7Uj=7A;?i06t-!Hb&hv;}7FTt~Hb?pa$`| z$jgxN(#;c$yNnZYCFF!3qBpBmnMo}Npt?$JGFgWQrnM1Tj?KDSiKcwImp~-r2pK6u zvRL%k|JJIL^YOJPyVq*&zW5U+lQ7>_}_ zLNotqzLXR>xc^mB$9LcYJ*03zmxQyF3l8Q}d`#V38<#_WlS3R?Ap(XUPqVKB+{q(D z&run+F^+%%SVgFfKuBzAV1u+F%$ldkS|2W*p$KB!iUal*E?G{TF}XIuR+)kHYjAwO zu9uyw6U!>p{j+`p8eTCw1BYm$Cn^`Z1iL);r8Pzve|Pl=JtBTJK!IFeg8SRB?98I( z(cYj22vZYy5y) zg3)%WgQ?~^{kg#WU~iAlAlX`q6e@V?T4yae9JMgLW@#^QXYEjqPgE$A{2NJwy}C&h zZ4}-fp9|G=Df^%V=Z3b){lJS3y7E)Jk-khB3)ucJA4_m?Kn zL&5S2TP17_wkbdPUU(98;$X3{GJkS=y&D0ww4ms1@^5syzwiEBI{&h{(c^jhzCX0S zPO{7Xt2~qN!A`eJ*Vm)<|t%<^v{(0rrLVMIt%I>)-FnELTY;J zpg$KXO{0myLtc$E!w7_iqX~fGb<~{Hv2&Ry$`4BnzKK4EftMgafV!1PG#-YVT5pbDapmBZq&e{&< zeB3y>@Kn|2_fmso0A%xxS}plElloohi^|_#9`s9qR-EG%0uu(O*a(^gfafy^Y-Rwy zfpEW3i6C;CgC=Nuv-J(;>IX5GWYGmXb8m=10;09Y(Sia zdOL>&MckDzXfE{AD{qng;#oufWQO!jc}gUwM{UJ0WT`XN{^uN`K|jR)wm3qaT|d)1 zEyL--Tq`qVro_u#O&`y-u^6MguTMF(58ZQS7_#q{%dc0Bu|Qifo);2?>Tj?8M$vUQ zSc~@X4?Cz_1`-BY8wH>64-EKTdaKhX5KOIz^5Jwk*+@(7`(l!)6>a;lBe3TV*=m@c_o|?5H zhgbwW4*`Dk4PpzSbDqNH$THkFR3!_|rB-msX38HH({Uoziwwk^q4Ppt*NsGLH5mkj znD9(nx{R?)oC?2%X@jzg1lF15h3&rA@X9)fSV+pSBVSF;`>{RTB65lyN|PsP;kz#! zzPs*7&E=7)W2cLaE#wmw`4NrW$1a@I7?Cx4Y8^j0n5)3YY#32SIT$v+a|Vo|ocV(N zcktr36I8H(gMpEMEEMJ6!TZmgv7w!v)j#1ItvDw=AcXpwWKW{6G#5~G5-G+QhE4C& zLy=*6vT3uj-ndrdmVnCexVNx1%`h;zaU<)^5Ee6*;zs(j7|jC50)GOgGn#u*M4_1K zUd{PX6kiKQYY0ZDQ^u~rI@>d{uQEAdIeFnU7)n|g9{#?Uc$d)c$yW}DTPGBHla;Z1 z-b+CY>Deiilrc$Bmv@W6s!J2*(h)z_wlOJgh{B-$?(fz$Ca*OJj;7_U&KaNKQ5?YO zn7C^=+w$ug%9GzVYZikSTvyBAthjo}{@8>=%oEG#*dW~~9N}F)S_P!upFTb<8e1m; z10q1I>*GqNP)BLQV?g$Iwk>DPEUl<0clVdRD#m}4E2b-D6}l_tRdir1-Ef@$A$)Xy z;d`Ge<~%JxcTcQ4rlvf6;-~iIzl}>loRMGkVbsSD1LObOxc_#f|83%bIaO1f0CFG; zVsH)mAsDAZnJBcZjygRPD)cQmI{HvGd<%0hcr358L47u(-{lxE#~jnmHn;hfLfi^w z+<4;b60A_GF@kA&pGDphr*cmt1g3Mo%Mvx7rd@c2-8lOqEkBe0UDVlS)54qQt)Ax! 
zg{4#HxJ=BLRvHWcK(^n!nX6A}Z$dUXzmyFxf-c!!^Yh-=XB!JOpIUifL#%s@pvB#oPqo|)k#a#&eqw))>%)*!`{S6=N~VsN|5_tf}wt_ z&P0oni9tQLm_Q#v#~$2B9>3x}!Ax_0ogKI)or{hYELZ8i)j5-$Qn~d5t@b$UZY7aO z(Hj#&^%KyLh|ImZkrW{Hn%+z%-ISn(`Y>@AsL+SuUQXkcc)G;i=xA5we&RtOcO{qB zYpB5=-9Kq5jaR@ajSd>>J0(f{^|iFkX;$Bij7eP~q9RE9iOBw!7oU){xM*EmxNebA z(s7j#uV%03+TJ*x=sG?$Z)w+lGQJT+SsmnOv(q_eRynF{mZ+JI)z>~#_|b7Wcr-vA zl;21e`PinaF}b91ag94^{oUD%Fi>1(w9i=Hoa zoy$Oczn_u2Uv&LGEC~*Q3HJZy0Dkz)zkc68?E8N*0{=$;&B*&V77Xk=ccW;(w%k2XABfmz4h$ z7k@+l&f5QhMzjA{4*whYcNY8)(3Ru=*ZhBS;@{xEqx?VM1 exxm2wFJbjDjP*e=1OtQnI0QfPHw4c=PyZkFry%zL literal 0 HcmV?d00001 diff --git a/main.py b/main.py new file mode 100644 index 0000000..7e5cefa --- /dev/null +++ b/main.py @@ -0,0 +1,161 @@ +import copy +import re + +import requests +from bs4 import BeautifulSoup +import pandas as pd +from time import sleep +import sql + + +class Book: + def __init__(self, name, url, star, star_people, author, translater, publisher, pub_year, price, comment): + self.name = name + self.url = url + self.star = star + self.star_people = star_people + self.author = author + self.translater = translater + self.publisher = publisher + self.pub_year = pub_year + self.price = price + self.comment = comment + + def to_dict(self): + return { + '书名': self.name, + '豆瓣链接': self.url, + '作者': self.author, + '译者': self.translater, + '出版社': self.publisher, + '出版日期': self.pub_year, + '价格': self.price, + '评分': self.star, + '评分人数': self.star_people, + '一句话评价': self.comment + } + + def __str__(self): + return f"Book Info: {self.name} - {self.author} - {self.pub_year} - {self.publisher} - {self.price} - {self.star} - {self.star_people} - {self.comment}" + + +class DoubanBookTop250Crawler: + def __init__(self): + self.book_list = [] + self.book_list_data=[] + + def get_book_info(self, url, headers): + res = requests.get(url, headers=headers) + soup = BeautifulSoup(res.text, 'html.parser') + for book in soup.select('.item'): + name = book.select('.pl2 a')[0]['title'] # 书名 + url = 
book.select('.pl2 a')[0]['href'] # 书籍链接 + star = book.select('.rating_nums')[0].text # 书籍评分 + star_people = book.select('.pl')[1].text # 评分人数 + # 提取其他字段 + info = book.select('.pl')[0].text.split('/') + if len(info) == 5: # 正常情况 + author = info[0] + translater = info[1] + publisher = info[2] + pub_year = info[3] + price = info[4] + elif len(info) == 4: # 没有译者 + author = info[0] + translater = None + publisher = info[1] + pub_year = info[2] + price = info[3] + elif len(info) == 6: # 有2个价格 + author = info[0] + translater = info[1] + publisher = info[2] + pub_year = info[3] + price = str(info[4]) + '/' + str(info[5]) + elif len(info) == 3: # 没有作者,且没有译者 + author = None + translater = None + publisher = info[0] + pub_year = info[1] + price = str(info[2]) + else: + # 这里可以加入错误处理逻辑,比如打印错误信息 + continue # 跳过当前循环 + comment = book.select('.quote span')[0].text if book.select('.quote span') else None # 一句话评价 + book_comment = book.select('.quote span')[0].text if book.select('.quote span') else None # 一句话评价 + book_obj = Book(name, url, star, star_people, author, translater, publisher, pub_year, price, book_comment) + self.book_list.append(book_obj.to_dict()) + self.book_list_data.append(book_obj) + + + def save_to_csv(self, csv_name): + df = pd.DataFrame(self.book_list) + df.to_csv(csv_name, encoding='utf_8_sig', index=False) + + def crawl_douban_top250(self): + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'} + for i in range(3): + page_url = 'https://book.douban.com/top250?start={}'.format(str(i * 25)) + print('开始爬取第{}页,地址是:{}'.format(str(i + 1), page_url)) + self.get_book_info(page_url, headers) + sleep(1) + self.save_to_csv(csv_name="BookDouban250.csv") + +# 实例化爬虫对象,并调用方法执行爬取和保存数据 +crawler = DoubanBookTop250Crawler() +crawler.crawl_douban_top250() +book_list = crawler.book_list +book_list_data = crawler.book_list_data 
# ---- main.py (continued): clean the crawled records, export them to Excel,
# ---- then load them into MySQL.


def _clean_star_people(text):
    """Return *text* without line breaks, spaces or parentheses."""
    return (text.replace('\n', '').strip()
            .replace(' ', '').replace("(", "").replace(")", "").strip())


def _normalize_price(text):
    """Return the first number found in *text* as a string.

    Integers and any number of decimals are accepted, so values such as
    'CNY 45' or 'USD 12.5' become '45' / '12.5' and can later be passed to
    ``float()``.  When *text* holds no number at all it is returned stripped
    and otherwise unchanged.
    """
    found = re.search(r'\d+(?:\.\d+)?', text)
    return found.group() if found else text.strip()


# Column order of the Excel sheet; the keys match ``Book.to_dict()``.
_EXCEL_COLUMNS = ("书名", "豆瓣链接", "作者", "译者", "出版社",
                  "出版日期", "价格", "评分", "评分人数", "一句话评价")

# The dicts are cleaned in place, so ``crawler.book_list`` (printed below)
# shows the cleaned rating counts as well.
for book in book_list:
    book['评分人数'] = _clean_star_people(book['评分人数'])
dataexcel = {column: [book[column] for book in book_list]
             for column in _EXCEL_COLUMNS}

# Database rows need purely numeric rating counts and prices.
book_list_data_two_price = []
for book in book_list_data:
    # Keeping only the digits also drops whitespace and parentheses.
    book.star_people = ''.join(filter(str.isdigit, book.star_people))
    first_price, slash, second_price = (
        book.price.replace('元', '').strip().partition('/'))
    book.price = _normalize_price(first_price)
    if slash:
        # A book listed with two prices is stored as two rows, one per price.
        alternate = copy.deepcopy(book)
        alternate.price = _normalize_price(second_price)
        book_list_data_two_price.append(alternate)
book_list_data = book_list_data + book_list_data_two_price


print(crawler.book_list)
# Convert the column dict to a DataFrame and write it to an Excel file.
df = pd.DataFrame(dataexcel)
file_name = 'book_data.xlsx'
df.to_excel(file_name, index=False)
print(f"书籍数据已写入到 {file_name}")

# Load the cleaned records into MySQL through BookDatabase.
# NOTE(review): the connection credentials are hard-coded here; consider
# reading them from the environment or a config file.
db = sql.BookDatabase(host='localhost', user='root', password='123456', database='xiaosuo', table_name='books')
db.initialize_table()
db.insert_books(book_list_data)
# ---- run.py: console client for the open.drea.cc chat API ----
import json

CHAT_ENDPOINT = 'https://open.drea.cc/bbsapi/chat/get'


def fetch_reply(msg, timeout=10):
    """Send *msg* to the chat API and return the reply text.

    Args:
        msg: Text typed by the user.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The value stored under ``data.reply`` in the JSON response.

    Raises:
        requests.RequestException: On network failure or a non-2xx status.
        KeyError: If the response JSON lacks ``data.reply``.
    """
    import requests  # lazy import: only needed when a request is made

    response = requests.get(
        CHAT_ENDPOINT,
        # ``params`` percent-encodes the user's text; concatenating it into
        # the URL by hand breaks on '&', '#', '+' and similar characters.
        params={'keyWord': msg, 'userName': 'type=bbs'},
        timeout=timeout,
    )
    response.raise_for_status()
    return json.loads(response.text)['data']['reply']


def chat_loop():
    """Prompt for messages and print each reply until EOF or Ctrl-C."""
    while True:
        try:
            msg = input('我:')
        except (EOFError, KeyboardInterrupt):
            break
        print('微梦机器人:', fetch_reply(msg))


if __name__ == '__main__':
    chat_loop()


# ---- sql.py: MySQL persistence for crawled books ----
class BookDatabase:
    """Store book records in a MySQL table through PyMySQL.

    Typical use::

        db = BookDatabase(host='localhost', user='root', password='...',
                          database='xiaosuo', table_name='books')
        db.initialize_table()
        db.insert_books(books)   # also closes the connection
    """

    # Columns written by insert_books, in placeholder order.
    _COLUMNS = ("name", "url", "star", "star_people", "author",
                "translater", "publisher", "pub_year", "price", "comment")

    def __init__(self, host, user, password, database, table_name):
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.table_name = table_name
        # Set by connect(); None while no connection is open.
        self.connection = None

    def _quoted_table(self):
        """Return the table name as a backtick-quoted MySQL identifier."""
        return "`{}`".format(self.table_name.replace("`", "``"))

    def _create_table_query(self):
        """Return the CREATE TABLE statement for the configured table."""
        return f"""
            CREATE TABLE IF NOT EXISTS {self._quoted_table()} (
                id INT AUTO_INCREMENT PRIMARY KEY,
                name VARCHAR(255) NOT NULL,
                url VARCHAR(255),
                star FLOAT,
                star_people INT,
                author VARCHAR(255),
                translater VARCHAR(255),
                publisher VARCHAR(255),
                pub_year VARCHAR(20),
                price FLOAT,
                comment TEXT
            )
            """

    def _insert_query(self):
        """Return the parameterised INSERT statement for the configured table."""
        # The query is %-formatted by the driver, so a literal '%' in the
        # table name has to be doubled.
        table = self._quoted_table().replace("%", "%%")
        columns = ", ".join(self._COLUMNS)
        placeholders = ", ".join(["%s"] * len(self._COLUMNS))
        return f"INSERT INTO {table} ({columns}) VALUES ({placeholders})"

    @staticmethod
    def _book_row(book):
        """Convert a book object into the parameter tuple for one INSERT.

        Raises:
            ValueError: If star, star_people or price is not numeric text.
        """
        return (book.name, book.url, float(book.star), int(book.star_people),
                book.author, book.translater, book.publisher,
                book.pub_year, float(book.price), book.comment)

    def connect(self):
        """Open the MySQL connection and keep it in ``self.connection``."""
        import pymysql  # lazy import: only needed when a connection is opened

        self.connection = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                          database=self.database, cursorclass=pymysql.cursors.DictCursor)

    def close(self):
        """Close the connection if one is open; otherwise do nothing."""
        if self.connection is not None:
            self.connection.close()
            self.connection = None

    def table_exists(self):
        """Return True if a table matching ``table_name`` exists."""
        with self.connection.cursor() as cursor:
            # Pass the name as a parameter so the driver escapes it.
            cursor.execute("SHOW TABLES LIKE %s", (self.table_name,))
            return bool(cursor.fetchone())

    def create_table(self):
        """Create the configured table if it does not exist yet."""
        with self.connection.cursor() as cursor:
            cursor.execute(self._create_table_query())
        self.connection.commit()

    def insert_books(self, booklist):
        """Insert every book in *booklist*, commit, and close the connection.

        Args:
            booklist: Iterable of objects exposing the attributes named in
                ``_COLUMNS``.

        The connection is closed even when a conversion or insert fails; in
        that case nothing is committed.
        """
        try:
            rows = [self._book_row(book) for book in booklist]
            with self.connection.cursor() as cursor:
                cursor.executemany(self._insert_query(), rows)
            self.connection.commit()
        finally:
            self.close()

    def initialize_table(self):
        """Connect and make sure the table exists.

        The connection is left open for a following ``insert_books`` call.
        """
        self.connect()
        if not self.table_exists():
            self.create_table()