# @Time : 2021/11/5 13:57
# @Author :wenkaic
# @File : 002草稿
# @Project : pythonProject4
"""Interactive scraper for the novel index page at www.ddxs.com.

Recovered from git patch f801dc36260af55859b335178689abb4b55b1523
(subject "最终版本"), which created the file ``最终版本1.0.py``.

The script downloads the index page once and then runs a small command loop:

    1  append the text of every ``th``/``tr`` element to a local text file
    2  print the text of every ``th``/``tr`` element
    3  do (1) and then render a word cloud image from the text file
    4  look up a chapter title by its number
    5  show the menu again
    6  quit
"""
import urllib.request
from urllib import parse

# NOTE: bs4, lxml, wordcloud, PIL and numpy are imported inside the functions
# that need them, so the pure helpers below (``change``, ``chapter_title``,
# ``find_chapter``) can be imported without those packages installed.

# Scratch note kept from the original file: a site-relative chapter path such
# as '/doupocangqiong/1656.html' can be resolved against ``url`` with
# ``parse.urljoin(url, '/doupocangqiong/1656.html')``.

url = 'http://www.ddxs.com/doupocangqiong/'

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40",
}

# Seconds to wait for the index page before giving up.
REQUEST_TIMEOUT = 30

# Local files read and written by commands 1 and 3.
TEXT_FILE = '003斗破苍穹.txt'
MASK_FILE = '006.png'
FONT_FILE = '方正粗黑宋简体.ttf'
CLOUD_FILE = '004dou.png'

# Chapter numbers accepted by command 4.
MIN_CHAPTER = 1
MAX_CHAPTER = 1623

MENU = ('输入1:保存在文件夹中\n'
        '输入2:直接输出\n'
        '输入3:词云(文件会下载到本地)\n'
        '输入4:查询章节\n'
        '输入5:显示提示信息\n'
        '输入6:退出')

_DIGIT_NAMES = {"1": "一", "2": "二", "3": "三", "4": "四", "5": "五",
                "6": "六", "7": "七", "8": "八", "9": "九"}
# Index = number of digits to the right of the current digit.
_UNIT_NAMES = ("", "十", "百", "千", "万")
# Units in front of which 2 is written "两" instead of "二".
_LIANG_UNITS = ("百", "千", "万")

# Titles that do not follow the regular pattern, keyed by chapter number.
# The strings are carried over unchanged from the original script.
# NOTE(review): presumably these mirror how the site spells those two
# chapters -- confirm against the live index page.
SITE_TITLE_OVERRIDES = {
    100: '一百零章',
    1011: '第一千零十一章',
}


def change(num):
    """Convert a non-negative decimal number to Chinese numerals.

    Args:
        num: The number as a string of ASCII digits (an ``int`` is also
            accepted). Leading zeros are ignored. At most five significant
            digits (up to 99999) are supported.

    Returns:
        The Chinese numeral string, e.g. ``"1001"`` -> ``"一千零一"``.
        An empty string is returned for empty input and ``"零"`` for zero.

    Raises:
        ValueError: If ``num`` contains a non-digit character or has more
            than five significant digits.
    """
    digits = str(num)
    if not digits:
        return ""
    if any(ch not in "0123456789" for ch in digits):
        raise ValueError("not a decimal number: %r" % (num,))
    digits = digits.lstrip("0")
    if not digits:
        return "零"
    if len(digits) > len(_UNIT_NAMES):
        raise ValueError("number too large (max 5 digits): %r" % (num,))

    parts = []
    zero_pending = False
    for offset, ch in enumerate(digits):
        unit = _UNIT_NAMES[len(digits) - offset - 1]
        if ch == "0":
            # Remember the gap; it is only written if a non-zero digit
            # follows, which drops trailing zeros and collapses runs of
            # zeros into a single "零".
            zero_pending = True
            continue
        if zero_pending:
            parts.append("零")
            zero_pending = False
        if ch == "2" and unit in _LIANG_UNITS:
            parts.append("两")
        else:
            parts.append(_DIGIT_NAMES[ch])
        parts.append(unit)

    result = "".join(parts)
    # 10-19 are written "十", "十一", ... without the leading "一".
    if result.startswith("一十"):
        result = result[1:]
    return result


def chapter_title(number):
    """Return the title text to search for, for chapter ``number`` (an int)."""
    override = SITE_TITLE_OVERRIDES.get(number)
    if override is not None:
        return override
    return '第' + change(str(number)) + '章'


def find_chapter(titles, wanted):
    """Return the first entry of ``titles`` containing ``wanted``, else None."""
    return next((title for title in titles if wanted in title), None)


def Word_cloud(text_path=TEXT_FILE, mask_path=MASK_FILE,
               font_path=FONT_FILE, out_path=CLOUD_FILE):
    """Render a word cloud image from a previously saved text file.

    Args:
        text_path: UTF-8 text file to read the words from.
        mask_path: Image file converted to an array and used as the mask.
        font_path: Font file passed to ``WordCloud``.
        out_path: Where the generated image is written.
    """
    import numpy as np
    from PIL import Image
    from wordcloud import WordCloud

    with Image.open(mask_path) as image:
        mask = np.array(image)
    with open(text_path, 'r', encoding='utf-8') as fp:
        text = fp.read()

    cloud = WordCloud(font_path=font_path, width=650, height=700,
                      contour_color='yellow', contour_width=2, mask=mask,
                      max_words=500)
    cloud.generate(text)
    cloud.to_file(out_path)


def save(n, soup, path=TEXT_FILE):
    """Save or print the text of every ``th``/``tr`` element in ``soup``.

    Args:
        n: ``'1'`` appends the rows to ``path``; ``'2'`` prints them.
            Any other value only prints an error message.
        soup: Parsed page; must provide ``select()`` returning elements
            that have ``get_text()``.
        path: File appended to when ``n == '1'``.
    """
    if n not in ('1', '2'):
        print('出现问题', n)
        print('没有执行')
        return

    rows = [node.get_text().strip() for node in soup.select('th,tr')]
    if n == '1':
        print('执行指令1')
        # The file is opened only in this branch, so printing (or a bad
        # command) never creates it. Append mode is kept from the original:
        # repeated saves accumulate in the file.
        with open(path, 'a', encoding='utf-8') as fp:
            # One row per line so neighbouring rows do not run together.
            fp.writelines(row + '\n' for row in rows)
    else:
        print('执行指令2')
        for row in rows:
            print(row)


def fetch_index(timeout=REQUEST_TIMEOUT):
    """Download the index page at ``url`` and return it decoded as UTF-8."""
    request = urllib.request.Request(url=url, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8')


def lookup_chapter(content):
    """Ask for a chapter number and print the matching link text.

    Args:
        content: Raw (unparsed) HTML of the index page.
    """
    from lxml import etree

    raw = input("\n请输入数字(1-1623):")
    try:
        number = int(raw)
    except ValueError:
        print("输入的不是数字!\n")
        return
    if not MIN_CHAPTER <= number <= MAX_CHAPTER:
        print("输入范围错误!\n")
        return

    titles = etree.HTML(content).xpath('//body//a/text()')
    wanted = chapter_title(number)
    print('n', wanted)
    match = find_chapter(titles, wanted)
    if match is None:
        print('未找到章节')
    else:
        print(match)


def main():
    """Fetch the index page once, then run the interactive command loop."""
    from bs4 import BeautifulSoup

    content = fetch_index()
    soup = BeautifulSoup(content, 'lxml')
    print(MENU)
    while True:
        print('请输入指令:')
        command = input()

        if command in ('1', '2'):
            save(command, soup)
        elif command == '3':
            # The word cloud is built from the saved text file, so the page
            # text is saved first.
            print('指令3执行')
            save('1', soup)
            Word_cloud()
        elif command == '4':
            # Chapter lookup works on the raw HTML, not on ``soup``.
            print('指令4执行')
            lookup_chapter(content)
        elif command == '5':
            print(MENU)
        elif command == '6':
            print('退出程序')
            break
        else:
            print('!!输入了错误指令!!')
            print('==================')
            print(MENU)
            print('==================')


if __name__ == "__main__":
    main()