commit 2d0a10ec2c2d991f2454d9e9854d632d556b268d Author: kun <13098278+w52020031129@user.noreply.gitee.com> Date: Mon May 27 10:49:22 2024 +0800 5-27 diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..35410ca --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# 默认忽略的文件 +/shelf/ +/workspace.xml +# 基于编辑器的 HTTP 客户端请求 +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..7c4836f --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,7 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..a44514f --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..31c3df9 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/豆瓣读书250爬虫_20220926.iml b/.idea/豆瓣读书250爬虫_20220926.iml new file mode 100644 index 0000000..36928bc --- /dev/null +++ b/.idea/豆瓣读书250爬虫_20220926.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/11_DouBan250Spider.py b/11_DouBan250Spider.py new file mode 100644 index 0000000..545babe --- /dev/null +++ b/11_DouBan250Spider.py @@ -0,0 +1,99 @@ + +import requests # 发送请求 +from bs4 import BeautifulSoup # 解析网页 +import 
import pandas as pd  # CSV output
from time import sleep  # pause between page requests

# Column buffers shared by get_book_info() (producer) and save_to_csv() (consumer).
# Every scraped book must append exactly one value to each list, otherwise the
# columns no longer line up when the DataFrame is built.
book_name = []  # title
book_url = []  # Douban link
book_star = []  # rating
book_star_people = []  # number of ratings
book_author = []  # author
book_translater = []  # translator
book_publisher = []  # publisher
book_pub_year = []  # publication date
book_price = []  # price
book_comment = []  # one-line quote


def _split_pub_info(info):
    """Map a '/'-split publication line to (author, translator, publisher, date, price).

    The list has 3 to 6 parts depending on whether the author/translator are
    present and whether two prices are listed.  An unrecognised layout yields
    five ``None`` values so that the caller still records one value per column.
    """
    parts = [str(part).strip() for part in info]
    if len(parts) == 5:  # regular case
        return parts[0], parts[1], parts[2], parts[3], parts[4]
    if len(parts) == 4:  # no translator
        return parts[0], None, parts[1], parts[2], parts[3]
    if len(parts) == 6:  # two prices
        return parts[0], parts[1], parts[2], parts[3], parts[4] + '/' + parts[5]
    if len(parts) == 3:  # neither author nor translator
        return None, None, parts[0], parts[1], parts[2]
    return None, None, None, None, None


def get_book_info(url, headers):
    """Scrape one Top-250 list page and append its books to the module-level lists.

    :param url: address of the list page to fetch
    :param headers: HTTP headers sent with the request (e.g. a User-Agent)
    :raises requests.RequestException: on network failure, timeout or an HTTP
        error status, so a blocked request is not silently saved as an empty CSV
    :return: None
    """
    res = requests.get(url, headers=headers, timeout=10)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    for book in soup.select('.item'):
        link = book.select('.pl2 a')[0]
        pl_nodes = book.select('.pl')
        quote = book.select('.quote span')

        book_name.append(link['title'])
        book_url.append(link['href'])
        book_star.append(book.select('.rating_nums')[0].text)
        # The node text looks like "(\n  424383人评价\n )"; keep only the digits.
        book_star_people.append(''.join(filter(str.isdigit, pl_nodes[1].text)))
        # Some entries have no one-line quote.
        book_comment.append(quote[0].text if quote else None)

        author, translater, publisher, pub_year, price = _split_pub_info(
            pl_nodes[0].text.split('/'))
        book_author.append(author)
        book_translater.append(translater)
        book_publisher.append(publisher)
        book_pub_year.append(pub_year)
        book_price.append(price)


def save_to_csv(csv_name):
    """Write the collected columns to a CSV file (UTF-8 with BOM, index column included).

    :param csv_name: path of the CSV file to create
    :return: None
    """
    df = pd.DataFrame({
        '书名': book_name,
        '豆瓣链接': book_url,
        '作者': book_author,
        '译者': book_translater,
        '出版社': book_publisher,
        '出版日期': book_pub_year,
        '价格': book_price,
        '评分': book_star,
        '评分人数': book_star_people,
        '一句话评价': book_comment,
    })
    df.to_csv(csv_name, encoding='utf_8_sig')


if __name__ == "__main__":
    # Request header that makes the crawler look like a desktop browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
    # Only the first page (25 books) is fetched; use range(10) for all 250.
    for i in range(1):
        page_url = 'https://book.douban.com/top250?start={}'.format(str(i * 25))
        print('开始爬取第{}页,地址是:{}'.format(str(i + 1), page_url))
        get_book_info(page_url, headers)
        sleep(1)  # be polite: wait one second between pages
    save_to_csv(csv_name="BookDouban250.csv")
+活着,https://book.douban.com/subject/4913064/,余华 ,, 作家出版社 , 2012-8 , 20.00元,9.4,"( + 834408人评价 + )",生的苦难与伟大 +1984,https://book.douban.com/subject/4820710/,[英] 乔治·奥威尔 , 刘绍铭 , 北京十月文艺出版社 , 2010-4-1 , 28.00,9.4,"( + 284757人评价 + )",栗树荫下,我出卖你,你出卖我 +哈利·波特,https://book.douban.com/subject/24531956/,J.K.罗琳 (J.K.Rowling) , 苏农 , 人民文学出版社 , 2008-12-1 , 498.00元,9.7,"( + 107448人评价 + )",从9¾站台开始的旅程 +三体全集,https://book.douban.com/subject/6518605/,刘慈欣 ,, 重庆出版社 , 2012-1 , 168.00元,9.5,"( + 187889人评价 + )",地球往事三部曲 +百年孤独,https://book.douban.com/subject/6082808/,[哥伦比亚] 加西亚·马尔克斯 , 范晔 , 南海出版公司 , 2011-6 , 39.50元,9.3,"( + 426035人评价 + )",魔幻现实主义文学代表作 +飘,https://book.douban.com/subject/1068920/,[美国] 玛格丽特·米切尔 , 李美华 , 译林出版社 , 2000-9 , 40.00元,9.3,"( + 212637人评价 + )",革命时期的爱情,随风而逝 +动物农场,https://book.douban.com/subject/2035179/,[英] 乔治·奥威尔 , 荣如德 , 上海译文出版社 , 2007-3 , 10.00元,9.3,"( + 160188人评价 + )",太阳底下并无新事 +房思琪的初恋乐园,https://book.douban.com/subject/27614904/,林奕含 ,, 北京联合出版公司 , 2018-2 , 45.00元,9.2,"( + 380002人评价 + )",向死而生的文学绝唱 +三国演义(全二册),https://book.douban.com/subject/1019568/,[明] 罗贯中 ,, 人民文学出版社 , 1998-05 , 39.50元,9.3,"( + 168645人评价 + )",是非成败转头空 +福尔摩斯探案全集(上中下),https://book.douban.com/subject/1040211/,[英] 阿·柯南道尔 , 丁钟华 等 , 群众出版社 , 1981-8 , 53.00元/68.00元,9.3,"( + 133712人评价 + )",名侦探的代名词 +白夜行,https://book.douban.com/subject/10554308/,[日] 东野圭吾 , 刘姿君 , 南海出版公司 , 2013-1-1 , 39.50元,9.2,"( + 475686人评价 + )",一宗离奇命案牵出跨度近20年步步惊心的故事 +小王子,https://book.douban.com/subject/1084336/,[法] 圣埃克苏佩里 , 马振骋 , 人民文学出版社 , 2003-8 , 22.00元,9.1,"( + 756658人评价 + )",献给长成了大人的孩子们 +安徒生童话故事集,https://book.douban.com/subject/1046209/,(丹麦)安徒生 , 叶君健 , 人民文学出版社 , 1997-08 , 25.00元,9.3,"( + 132841人评价 + )",为了争取未来的一代 +天龙八部,https://book.douban.com/subject/1255625/,金庸 ,, 生活·读书·新知三联书店 , 1994-5 , 96.00元,9.2,"( + 133043人评价 + )",有情皆孽,无人不冤 +呐喊,https://book.douban.com/subject/1449351/,鲁迅 ,, 人民文学出版社 , 1973-3 , 0.36元,9.2,"( + 160624人评价 + )",新文学的第一声呐喊 +撒哈拉的故事,https://book.douban.com/subject/1060068/,三毛 ,, 哈尔滨出版社 , 2003-8 , 15.80元,9.2,"( + 175143人评价 + )",游荡的自由灵魂 
+悉达多,https://book.douban.com/subject/26980487/,[德] 赫尔曼·黑塞 , 姜乙 , 天津人民出版社 , 2017-1 , 32.00元,9.3,"( + 105727人评价 + )", +邓小平时代,https://book.douban.com/subject/20424526/,【美】傅高义 (Ezra.F.Vogel) , 冯克利 , 生活·读书·新知三联书店 , 2013-1-18 , 88.00元,9.3,"( + 70891人评价 + )",个人命运背后的历史变局 +杀死一只知更鸟,https://book.douban.com/subject/6781808/,[美] 哈珀·李 , 高红梅 , 译林出版社 , 2012-9 , 32.00元,9.2,"( + 146983人评价 + )",有一种东西不能遵循从众原则,那就是——人的良心 +失踪的孩子,https://book.douban.com/subject/30172069/,[意] 埃莱娜·费兰特 , 陈英 , 人民文学出版社 , 2018-7 , 62.00元,9.2,"( + 81644人评价 + )",我的整个生命,只是一场为了提升社会地位的低俗斗争。 +明朝那些事儿(1-9),https://book.douban.com/subject/3674537/,当年明月 ,, 中国海关出版社 , 2009-4 , 358.20元,9.2,"( + 175139人评价 + )",不拘一格的历史书写 +新名字的故事,https://book.douban.com/subject/26986954/,[意] 埃莱娜·费兰特 , 陈英 , 人民文学出版社 , 2017-4 , 59.00元,9.2,"( + 92225人评价 + )",探索青年时代的激情、困惑、挣扎、背叛和失去 +野草,https://book.douban.com/subject/1915958/,鲁迅 ,, 人民文学出版社 , 1973-3 , 0.20元,9.5,"( + 47503人评价 + )",我以这一丛野草,在明与暗,生与死,过去与未来之际,献于友与仇,人与兽,爱者与不爱者之前作证。 +沉默的大多数,https://book.douban.com/subject/1054685/,王小波 ,, 中国青年出版社 , 1997-10 , 27.00元,9.1,"( + 151597人评价 + )",沉默是沉默者的通行证 +中国历代政治得失,https://book.douban.com/subject/1003479/,钱穆 ,, 生活·读书·新知三联书店 , 2001 , 12.00元,9.2,"( + 72960人评价 + )",一部简明的“中国政治制度史” +局外人,https://book.douban.com/subject/4908885/,[法] 阿尔贝·加缪 , 柳鸣九 , 上海译文出版社 , 2010-8 , 22.00元,9.1,"( + 240586人评价 + )",人生在世,永远也不该演戏作假 +乡土中国,https://book.douban.com/subject/1795079/,费孝通 ,, 上海人民出版社 , 2006-04-01 , 38.00,9.2,"( + 90709人评价 + )",中国乡土社会传统文化和社会结构理论研究代表作 +白鹿原,https://book.douban.com/subject/10564071/,陈忠实 ,, 人民文学出版社 , 2012-9 , 39.00元,9.2,"( + 108128人评价 + )",一轴关于我们民族灵魂的现实主义画卷 +卡拉马佐夫兄弟,https://book.douban.com/subject/25887924/,[俄] 费奥多尔·陀思妥耶夫斯基 , 荣如德 , 上海译文出版社 , 2015-2-1 , CNY 78.00,9.6,"( + 34343人评价 + )", +人类简史,https://book.douban.com/subject/25985021/,[以色列] 尤瓦尔·赫拉利 , 林俊宏 , 中信出版社 , 2014-11 , 68.00元,9.1,"( + 199868人评价 + )",跟着人类一同走过十万年 +围城,https://book.douban.com/subject/1008145/,钱锺书 ,, 人民文学出版社 , 1991-2 , 19.00,9.0,"( + 447368人评价 + )",幽默的语言和对生活深刻的观察 
+彷徨,https://book.douban.com/subject/1449348/,鲁迅 ,, 人民文学出版社 , 1973-3 , 0.37元,9.3,"( + 65707人评价 + )",路漫漫其修远兮,吾将上下而求索 +平凡的世界(全三部),https://book.douban.com/subject/1200840/,路遥 ,, 人民文学出版社 , 2005-1 , 64.00元,9.0,"( + 324539人评价 + )",中国当代城乡生活全景 +罗杰疑案,https://book.douban.com/subject/21371175/,[英] 阿加莎·克里斯蒂 , 常禾 , 新星出版社 , 2013-3 , 28.00元,9.2,"( + 74837人评价 + )", +许三观卖血记,https://book.douban.com/subject/4760224/,余华 ,, 作家出版社 , 2012-9 , 24.00元,9.2,"( + 164226人评价 + )", +我与地坛,https://book.douban.com/subject/1209899/,史铁生 ,, 春风文艺出版社 , 2002-5 , 25.00元,9.2,"( + 109004人评价 + )",这是你的罪孽与福祉 +笑傲江湖(全四册),https://book.douban.com/subject/1002299/,金庸 ,, 生活·读书·新知三联书店 , 1994-5 , 76.80元,9.1,"( + 111909人评价 + )",欲练此功,必先自宫 +献给阿尔吉侬的花束,https://book.douban.com/subject/26362836/,[美] 丹尼尔·凯斯 , 陈澄和 , 广西师范大学出版社 , 2015-4 , 36.00元,9.1,"( + 109249人评价 + )",当声称能改造智能的科学实验选中心智障碍主角 +东方快车谋杀案,https://book.douban.com/subject/1827374/,[英] 阿加莎·克里斯蒂 , 陈尧光 , 人民文学出版社 , 2006-5 , 18.00元,9.1,"( + 139814人评价 + )",谋杀诡计惊人,波洛的抉择耐人寻味 +肖申克的救赎,https://book.douban.com/subject/1829226/,[美] 斯蒂芬·金 , 施寄青 , 人民文学出版社 , 2006-7 , 29.90元,9.1,"( + 121642人评价 + )",豆瓣电影Top1原著 +江城,https://book.douban.com/subject/7060185/,[美] 彼得·海斯勒 , 李雪顺 , 上海译文出版社 , 2012-1 , 36.00元,9.1,"( + 63040人评价 + )",外国人眼中的涪陵 +基督山伯爵,https://book.douban.com/subject/1085860/,[法国] 大仲马 , 周克希 , 上海译文出版社 , 1991-12-1 , 43.90元,9.1,"( + 129540人评价 + )",一个报恩复仇的故事,以法国波旁王朝和七月王朝为背景 +城南旧事,https://book.douban.com/subject/1254588/,林海音 文 ,, 中国青年出版社 , 2003-7 , 16.00元,9.1,"( + 157423人评价 + )",长亭外,古道边,芳草碧连天 +霍乱时期的爱情,https://book.douban.com/subject/10594787/,[哥伦比亚] 加西亚·马尔克斯 , 杨玲 , 南海出版公司 , 2012-9-1 , 39.50元,9.0,"( + 286118人评价 + )",义无反顾地直达爱情的核心 +故事新编,https://book.douban.com/subject/2046909/,鲁迅 ,, 人民文学出版社 , 1973-12-01 , 0.31 元,9.4,"( + 43436人评价 + )",拾取古代传说,取一点因由,随意点染 +艺术的故事,https://book.douban.com/subject/3162991/,[英] 贡布里希 (Sir E.H.Gombrich) , 范景中 , 广西美术出版社 , 2008-04 , 280.00,9.6,"( + 26388人评价 + )",从最早的洞窟绘画到当今的实验艺术 +万历十五年,https://book.douban.com/subject/1041482/,[美] 黄仁宇 ,, 生活·读书·新知三联书店 , 1997-5 , 18.00元,9.0,"( 
+ 202254人评价 + )",见微知著,历史观的颠覆 +朝花夕拾,https://book.douban.com/subject/1449352/,鲁迅 ,, 人民文学出版社 , 1973-4 , 0.25元,9.1,"( + 155213人评价 + )",在纷扰中寻出一点闲静 +月亮和六便士,https://book.douban.com/subject/1858513/,[英] 毛姆 , 傅惟慈 , 上海译文出版社 , 2006-8 , 15.00元,9.0,"( + 209821人评价 + )",有多少人会经历顿悟,就有更少的人甘愿自我放逐 +厌女,https://book.douban.com/subject/25836270/,上野千鹤子 , 王兰 , 上海三联书店 , 2015-1 , 28.00,9.1,"( + 82066人评价 + )", +秋园,https://book.douban.com/subject/34998019/,杨本芬 ,, 北京联合出版公司 , 2020-6 , 38.00元,9.0,"( + 108614人评价 + )", +射雕英雄传,https://book.douban.com/subject/1044547/,金庸 ,, 生活·读书·新知三联书店 , 1999-04 , 47.00元,9.1,"( + 85813人评价 + )",侠之大者,为国为民 +置身事内,https://book.douban.com/subject/35546622/,兰小欢 ,, 上海人民出版社 , 2021-8 , 65.00元,9.1,"( + 80263人评价 + )", +追风筝的人,https://book.douban.com/subject/1770782/,[美] 卡勒德·胡赛尼 , 李继宏 , 上海人民出版社 , 2006-5 , 29.00元,8.9,"( + 795265人评价 + )",为你,千千万万遍 +树上的男爵,https://book.douban.com/subject/6789605/,[意大利]伊塔洛·卡尔维诺 , 吴正仪 , 译林出版社 , 2012-4-1 , 30.00元,9.1,"( + 60492人评价 + )",是不是真的只有先与人疏离,才能最终与他们在一起? +寻路中国,https://book.douban.com/subject/5414391/,[美] 彼得·海斯勒 , 李雪顺 , 上海译文出版社 , 2011-1 , 33.00元,9.0,"( + 56992人评价 + )",《纽约客》驻北京记者驾车漫游中国大陆的经历 +刀锋,https://book.douban.com/subject/2035162/,[英]毛姆 , 周煦良 , 上海译文出版社 , 2007-3 , 18.00元,9.0,"( + 92048人评价 + )",一把刀的锋刃不容易越过;因此智者说得救之道是困难的 +无人生还,https://book.douban.com/subject/3006581/,[英] 阿加莎・克里斯蒂 , 祁阿红 , 人民文学出版社 , 2008-3 , 19.00,9.0,"( + 150494人评价 + )",童谣杀人案 +格林童话全集,https://book.douban.com/subject/1043008/,[德国]格林兄弟 , 魏以新 , 人民文学出版社 , 1994-11 , 21.45元,9.1,"( + 95633人评价 + )",一本有教育意义的书 +中国少年儿童百科全书(全四册),https://book.douban.com/subject/1028409/,林崇德 主编 ,, 浙江教育出版社 , 1991-4 , 168.00元,9.4,"( + 18700人评价 + )", +鼠疫,https://book.douban.com/subject/24257229/,[法] 阿尔贝·加缪 , 刘方 , 上海译文出版社 , 2013-8 , 34.00元,9.1,"( + 79496人评价 + )",用别样的监禁生活再现某种监禁生活,与用不存在的事表现真事同等合理 +西游记(全二册),https://book.douban.com/subject/1029553/,吴承恩 , 黄肃秋 注释 , 人民文学出版社 , 2004-8 , 47.20元,9.1,"( + 89041人评价 + )",神魔皆有人情,精魅亦通世故 +嫌疑人X的献身,https://book.douban.com/subject/3211779/,[日] 东野圭吾 , 刘子倩 , 南海出版公司 , 2008-9 , 28.00,8.9,"( + 
524507人评价 + )",数学好是一种极致的浪漫 +黄金时代,https://book.douban.com/subject/1089243/,王小波 ,, 花城出版社 , 1999-3 , 19.00元,8.9,"( + 166901人评价 + )",我想爱,想吃,还想在一瞬间变成天上半明半暗的云 +可能性的艺术,https://book.douban.com/subject/35819419/,刘瑜 ,, 广西师范大学出版社 , 2022-4 , 82.00元,9.2,"( + 40954人评价 + )", +傲慢与偏见,https://book.douban.com/subject/1083428/,[英] 奥斯丁 , 张玲 , 人民文学出版社 , 1993-7 , 13.00元,8.9,"( + 233395人评价 + )",所有现代言情小说的母体 +史记(全十册),https://book.douban.com/subject/1077847/,司马迁 , (索隐)司马贞,(正义)张守节 , 中华书局 , 1982-11 , 125.00,9.6,"( + 24977人评价 + )",史家之绝唱,无韵之离骚 +始于极限,https://book.douban.com/subject/35966120/,[日] 上野千鹤子 , 曹逸冰 , 新星出版社 , 2022-9-20 , 59,9.0,"( + 71582人评价 + )", +悲惨世界(上中下),https://book.douban.com/subject/1205054/,[法] 雨果 , 李丹 , 人民文学出版社 , 1992-6 , 66.00元,9.1,"( + 63925人评价 + )",现实主义与浪漫主义的至高杰作 +台北人,https://book.douban.com/subject/5337248/,白先勇 ,, 广西师范大学出版社 , 2010-10 , 38.00元,9.0,"( + 73866人评价 + )",白先勇短篇小说集 +永恒的终结,https://book.douban.com/subject/25829693/,[美] 艾萨克·阿西莫夫 , 崔正男 , 江苏凤凰文艺出版社 , 2014-9 , 32.00元,9.1,"( + 47492人评价 + )",关于时间旅行的终极奥秘和恢宏构想 +诗经,https://book.douban.com/subject/1883245/,孔丘 编订 ,, 北京出版社 , 2006-7 , 19.90元,9.5,"( + 28840人评价 + )",思无邪 +孽子,https://book.douban.com/subject/5337254/,白先勇 ,, 广西师范大学出版社 , 2010.10 , 46.00元,9.2,"( + 47750人评价 + )",写给那一群, 在最深最深的黑夜里, 独自彷徨街头, 无所依归的孩子们 +刘擎西方现代思想讲义,https://book.douban.com/subject/35313227/,刘擎 ,, 新星出版社 , 2021-1 , 79.00元,9.2,"( + 36553人评价 + )", diff --git a/GUI.py b/GUI.py new file mode 100644 index 0000000..797db5f --- /dev/null +++ b/GUI.py @@ -0,0 +1,155 @@ +from pyecharts import options as opts +from pyecharts.charts import Bar,Page,Line,Timeline +from pyecharts.commons.utils import JsCode +from pyecharts.globals import ThemeType +from pyecharts.charts import Pie +from pyecharts.faker import Faker +from main import book_list_data +from collections import Counter + +book_list_data_sortBystart = sorted(book_list_data, key=lambda x: x.star,reverse=True) +book_list_data_sortBystart=book_list_data_sortBystart[:10] 
# Reverse the top-10 slice so the best-rated book ends up at the top of the
# horizontal bar chart (the axes are swapped with reversal_axis() below).
book_list_data_sortBystart = book_list_data_sortBystart[::-1]
# Titles and ratings of the ten selected books.
book_names = [book.name for book in book_list_data_sortBystart]
book_stars = [book.star for book in book_list_data_sortBystart]
x_data = list(range(1, len(book_names) + 1))

# Bar chart: the ten best-rated books.
bar = (
    Bar(init_opts=opts.InitOpts(theme="shine", width="850px", height='400px'))
    .add_xaxis(book_names)
    .add_yaxis("评分", book_stars, color="red")
    .reversal_axis()  # swap axes so the bars are horizontal
    .set_global_opts(
        title_opts=opts.TitleOpts(title="评分前10榜", pos_bottom="bottom", pos_left="center"),
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=0)),
        yaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-45)),
    )
)

# Pie chart: how many listed books each publisher has.
publisher_counter = Counter(book.publisher for book in book_list_data)
publisher_names = list(publisher_counter.keys())
publisher_counts = list(publisher_counter.values())
colors = [
    "#5470c6", "#91cc75", "#fac858", "#ee6666", "#73c0de",
    "#3ba272", "#fc8452", "#9a60b4", "#ea7ccc", "#bb60b4",
    "#8B008B", "#FF1493", "#1E90FF", "#20B2AA", "#2E8B57",
    "#B22222", "#FF4500", "#4682B4", "#DAA520", "#32CD32",
]
pie = (
    Pie()
    .add("", [list(z) for z in zip(publisher_names, publisher_counts)])
    .set_colors(colors)
    .set_global_opts(title_opts=opts.TitleOpts(title="出版商分布饼图", pos_left="center", pos_bottom="bottom"))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
)

# Line chart: number of books per price band.
prices = [float(book.price) for book in book_list_data]
print(prices)
# Band edges; the final band [500, inf) catches every price of 500 or more.
price_intervals = [0, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, float('inf')]
prices_intervalsStr = ["0-50", "50-100", "100-150", "150-200", "200-250", "250-300",
                       "300-350", "350-400", "400-450", "450-500", "500以上"]
price_counts = [0] * (len(price_intervals) - 1)  # one counter per band
for price in prices:
    for i, (low, high) in enumerate(zip(price_intervals, price_intervals[1:])):
        if low <= price < high:
            price_counts[i] += 1
            break
line = (
    Line()
    # One label per band, including the open-ended last one, so that labels
    # and counts have the same length.
    .add_xaxis([str(interval) for interval in prices_intervalsStr])
    .add_yaxis("书籍数量", price_counts, symbol="circle", is_smooth=True,
               markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem(type_="max")]))
    .set_global_opts(title_opts=opts.TitleOpts(title="价格区间与书籍数量折线图", pos_left="center", pos_bottom="bottom"),
                     xaxis_opts=opts.AxisOpts(name="价格区间"),
                     yaxis_opts=opts.AxisOpts(name="书籍数量"),
                     datazoom_opts=opts.DataZoomOpts(range_start=0, range_end=100))
)


def _publication_year(book):
    """Return the leading four-digit year of ``book.pub_year``, or None if absent.

    Taking the first four characters copes with every separator seen in the
    scraped data ('2010-4-1', '2010.10', '2001') instead of assuming '-'.
    """
    head = (book.pub_year or '').strip()[:4]
    return int(head) if head.isdigit() else None


# Line chart: number of books per publication year.
publish_dates = [year for year in map(_publication_year, book_list_data) if year is not None]
print(publish_dates)
publish_year_counter = Counter(publish_dates)
print(publish_year_counter)
# Use a continuous year range; a Counter returns 0 for years without books.
full_year_range = range(min(publish_dates), max(publish_dates) + 1) if publish_dates else range(0)
print(full_year_range)
pub_year_counts = [(year, publish_year_counter[year]) for year in full_year_range]
years = [str(year) for year, count in pub_year_counts]
counts = [count for year, count in pub_year_counts]
print(pub_year_counts)

line_year_count = (
    Line()
    .add_xaxis(years)
    .add_yaxis(
        series_name="出版书籍数量",
        y_axis=counts,
        markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem(type_="max", name="最大值"),
                                                opts.MarkPointItem(type_="min", name="最小值")]),
        markline_opts=opts.MarkLineOpts(data=[opts.MarkLineItem(type_="average", name="平均值")]),
    )
    .set_global_opts(title_opts=opts.TitleOpts(title="年份与出版书籍数量变化"),
                     xaxis_opts=opts.AxisOpts(name="年份"),
                     yaxis_opts=opts.AxisOpts(name="书籍数量"),
                     )
)

# Put all four charts on one page and write it to disk.
page = Page()
page.add(line_year_count)
page.add(bar)
page.add(pie)
page.add(line)
page.render("book_analysis.html")
+ + + diff --git a/book_analysis.html b/book_analysis.html new file mode 100644 index 0000000..5980ee3 --- /dev/null +++ b/book_analysis.html @@ -0,0 +1,1130 @@ + + + + + Awesome-pyecharts + + + + + + + +
+
+ +
+ +
+ +
+ +
import copy
import re
from time import sleep

import pandas as pd

# Column order shared by the CSV and Excel exports.
_BOOK_COLUMNS = ('书名', '豆瓣链接', '作者', '译者', '出版社', '出版日期', '价格', '评分', '评分人数', '一句话评价')
# First integer or decimal number in a price string such as "CNY 78.00" or "59".
_NUMBER_RE = re.compile(r'\d+(?:\.\d+)?')
# Names that are produced by run_pipeline() and served lazily on import.
_PIPELINE_EXPORTS = ('crawler', 'book_list', 'book_list_data', 'dataexcel', 'df', 'db')

file_name = 'book_data.xlsx'


class Book:
    """One entry of the Douban Top-250 book list."""

    def __init__(self, name, url, star, star_people, author, translater, publisher, pub_year, price, comment):
        self.name = name
        self.url = url
        self.star = star
        self.star_people = star_people
        self.author = author
        self.translater = translater
        self.publisher = publisher
        self.pub_year = pub_year
        self.price = price
        self.comment = comment

    def to_dict(self):
        """Return the book as a dict keyed by the export column headers."""
        return {
            '书名': self.name,
            '豆瓣链接': self.url,
            '作者': self.author,
            '译者': self.translater,
            '出版社': self.publisher,
            '出版日期': self.pub_year,
            '价格': self.price,
            '评分': self.star,
            '评分人数': self.star_people,
            '一句话评价': self.comment
        }

    def __str__(self):
        return f"Book Info: {self.name} - {self.author} - {self.pub_year} - {self.publisher} - {self.price} - {self.star} - {self.star_people} - {self.comment}"


def _digits_only(text):
    """Keep only the digit characters of *text* ("(\\n 424383人评价\\n )" -> "424383")."""
    return ''.join(filter(str.isdigit, text))


def _first_number(text):
    """Return the first number found in *text*, or the stripped text if there is none."""
    match = _NUMBER_RE.search(text)
    return match.group() if match else text.strip()


def _split_pub_info(info):
    """Map a '/'-split publication line to (author, translator, publisher, date, price).

    Returns None when the number of parts is not one of the known layouts.
    """
    parts = [str(part).strip() for part in info]
    if len(parts) == 5:  # regular case
        return parts[0], parts[1], parts[2], parts[3], parts[4]
    if len(parts) == 4:  # no translator
        return parts[0], None, parts[1], parts[2], parts[3]
    if len(parts) == 6:  # two prices
        return parts[0], parts[1], parts[2], parts[3], parts[4] + '/' + parts[5]
    if len(parts) == 3:  # neither author nor translator
        return None, None, parts[0], parts[1], parts[2]
    return None


def normalize_books(books):
    """Clean rating counts and prices in place and expand two-price books.

    ``star_people`` is reduced to its digits and ``price`` to its first number.
    A book listing two prices ("53.00元/68.00元") keeps the first price, and a
    deep copy carrying the second price is appended after all original books.

    :param books: list of Book objects (mutated in place)
    :return: new list containing the original books followed by the copies
    """
    second_price_copies = []
    for book in books:
        book.star_people = _digits_only(book.star_people)
        first, _, second = book.price.replace('元', '').partition('/')
        book.price = _first_number(first)
        if second:
            twin = copy.deepcopy(book)
            twin.price = _first_number(second)
            second_price_copies.append(twin)
    return list(books) + second_price_copies


class DoubanBookTop250Crawler:
    """Scrapes Douban Top-250 list pages into Book objects and export rows."""

    def __init__(self):
        self.book_list = []  # one dict per book, as written to CSV/Excel
        self.book_list_data = []  # the matching Book objects

    def get_book_info(self, url, headers):
        """Fetch one list page and append every parsable book on it.

        :param url: address of the list page
        :param headers: HTTP headers sent with the request
        :raises requests.RequestException: on network failure, timeout or an
            HTTP error status
        """
        # Imported here so that using Book / normalize_books does not require
        # the scraping dependencies to be installed.
        import requests
        from bs4 import BeautifulSoup

        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')
        for item in soup.select('.item'):
            link = item.select('.pl2 a')[0]
            pl_nodes = item.select('.pl')
            fields = _split_pub_info(pl_nodes[0].text.split('/'))
            if fields is None:
                continue  # unknown layout of the publication line: skip the entry
            quote = item.select('.quote span')
            book_obj = Book(
                link['title'],
                link['href'],
                item.select('.rating_nums')[0].text,
                # Cleaned before the export row is built, so the CSV no longer
                # contains the raw "(\n ...人评价\n )" text.
                _digits_only(pl_nodes[1].text),
                *fields,
                quote[0].text if quote else None,
            )
            self.book_list.append(book_obj.to_dict())
            self.book_list_data.append(book_obj)

    def save_to_csv(self, csv_name):
        """Write the collected export rows to *csv_name* (UTF-8 with BOM, no index)."""
        df = pd.DataFrame(self.book_list)
        df.to_csv(csv_name, encoding='utf_8_sig', index=False)

    def crawl_douban_top250(self):
        """Crawl the first three list pages (75 books) and save them to BookDouban250.csv."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
        for i in range(3):
            page_url = 'https://book.douban.com/top250?start={}'.format(str(i * 25))
            print('开始爬取第{}页,地址是:{}'.format(str(i + 1), page_url))
            self.get_book_info(page_url, headers)
            sleep(1)
        self.save_to_csv(csv_name="BookDouban250.csv")


def run_pipeline():
    """Crawl, export to Excel and store the books in MySQL.

    :return: dict with the objects the module used to create at import time
        (crawler, book_list, book_list_data, dataexcel, df, db)
    """
    import sql  # project module wrapping the MySQL access

    crawler = DoubanBookTop250Crawler()
    crawler.crawl_douban_top250()
    book_list = crawler.book_list
    book_list_data = normalize_books(crawler.book_list_data)

    print(crawler.book_list)
    dataexcel = {column: [row[column] for row in book_list] for column in _BOOK_COLUMNS}
    df = pd.DataFrame(dataexcel)
    df.to_excel(file_name, index=False)
    print(f"书籍数据已写入到 {file_name}")

    # NOTE(review): credentials are hard-coded; move them to configuration.
    db = sql.BookDatabase(host='localhost', user='root', password='123456', database='xiaosuo', table_name='books')
    db.initialize_table()
    db.insert_books(book_list_data)
    return {'crawler': crawler, 'book_list': book_list, 'book_list_data': book_list_data,
            'dataexcel': dataexcel, 'df': df, 'db': db}


def __getattr__(name):
    """Run the pipeline on first access to one of its results (PEP 562).

    ``from main import book_list_data`` keeps working, but merely importing the
    module no longer crawls the site or writes to the database.
    """
    if name in _PIPELINE_EXPORTS:
        globals().update(run_pipeline())
        return globals()[name]
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


if __name__ == "__main__":
    globals().update(run_pipeline())
# ---------------------------------------------------------------- run.py ----
CHAT_API_URL = 'https://open.drea.cc/bbsapi/chat/get'


def chat_loop():
    """Console chat: send each typed line to the chat API and print the reply.

    Ends cleanly on Ctrl-C / end of input instead of with a traceback.
    """
    import json
    import requests

    while True:
        try:
            msg = input('我:')
        except (EOFError, KeyboardInterrupt):
            break
        # Let requests URL-encode the message; concatenating it into the URL
        # breaks on spaces, '&', '#' and similar characters.
        sess = requests.get(CHAT_API_URL, params={'keyWord': msg, 'userName': 'type=bbs'}, timeout=10)
        js = json.loads(sess.text)
        print('微梦机器人:', js['data']['reply'])


if __name__ == "__main__":
    chat_loop()


# ---------------------------------------------------------------- sql.py ----
class BookDatabase:
    """Small MySQL helper that creates the book table and inserts Book rows."""

    def __init__(self, host, user, password, database, table_name):
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.table_name = table_name
        self.connection = None  # set by connect()

    def _table(self):
        """Return the table name as a backtick-quoted MySQL identifier."""
        return '`' + self.table_name.replace('`', '``') + '`'

    def connect(self):
        """Open the MySQL connection (rows are returned as dicts)."""
        import pymysql

        self.connection = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                          database=self.database, cursorclass=pymysql.cursors.DictCursor)

    def close(self):
        """Close the connection if one is open."""
        if self.connection is not None:
            self.connection.close()
            self.connection = None

    def table_exists(self):
        """Return True if a table matching ``table_name`` exists in the database."""
        with self.connection.cursor() as cursor:
            cursor.execute("SHOW TABLES LIKE %s", (self.table_name,))
            result = cursor.fetchone()
            return bool(result)

    def create_table(self):
        """Create the configured table (not a hard-coded ``books``) if it is missing."""
        with self.connection.cursor() as cursor:
            create_table_query = f"""
            CREATE TABLE IF NOT EXISTS {self._table()} (
                id INT AUTO_INCREMENT PRIMARY KEY,
                name VARCHAR(255) NOT NULL,
                url VARCHAR(255),
                star FLOAT,
                star_people INT,
                author VARCHAR(255),
                translater VARCHAR(255),
                publisher VARCHAR(255),
                pub_year VARCHAR(20),
                price FLOAT,
                comment TEXT
            )
            """
            cursor.execute(create_table_query)
        self.connection.commit()

    def insert_books(self, booklist):
        """Insert every book of *booklist*, commit, and close the connection.

        :param booklist: iterable of objects exposing name, url, star,
            star_people, author, translater, publisher, pub_year, price, comment
        :raises ValueError: if star, star_people or price is not numeric text
        The connection is closed even when an insert fails; nothing is
        committed in that case.
        """
        # The original text read "INSERT INTO{table_name}", which rendered as
        # "INSERT INTObooks" and failed on every call.
        insert_query = (
            f"INSERT INTO {self._table()} "
            "(name, url, star, star_people, author, translater, publisher, pub_year, price, comment) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
        )
        try:
            with self.connection.cursor() as cursor:
                for book in booklist:
                    cursor.execute(insert_query,
                                   (book.name, book.url, float(book.star), int(book.star_people),
                                    book.author, book.translater, book.publisher,
                                    book.pub_year, float(book.price), book.comment))
            self.connection.commit()
        finally:
            self.close()

    def initialize_table(self):
        """Connect and make sure the table exists; the connection stays open."""
        self.connect()
        if not self.table_exists():
            self.create_table()