# @Time : 2021/11/5 13:57
# @Author :wenkaic
# @File : 002草稿
# @Project : pythonProject4
"""Scrape a novel's chapter index page and offer a small interactive menu.

The menu can save the index text to a file, print it, build a word cloud
from the saved text, or look up a chapter title by its Arabic number.
"""
import urllib.request
from urllib import parse

import numpy as np
from bs4 import BeautifulSoup
from lxml import etree
from PIL import Image
from wordcloud import WordCloud

# Index page that lists every chapter link.
# NOTE: chapter hrefs on this page are site-relative (for example
# '/doupocangqiong/1656.html'); parse.urljoin(url, href) gives the absolute URL.
url = 'http://www.ddxs.com/doupocangqiong/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40",
}

# File that save() appends to and Word_cloud() reads from.
_TEXT_FILE = '003斗破苍穹.txt'

# Menu printed once at start-up (kept exactly as the script always showed it).
_STARTUP_MENU = ('输入1:保存在文件夹中\n'
                 '输入2:直接输出\n'
                 '输入3:词云(文件会下载到本地)\n'
                 '输入4:查询章节\n'
                 '输入5:显示提示信息')

# Menu printed for command 5 and after an unknown command.
_HELP_MENU = ('输入1:保存在文件夹中\n'
              '输入2:直接输出\n'
              '输入3:词云\n'
              '输入4:查询章节\n'
              '输入5:显示提示信息\n'
              '输入6:退出')

# Hand-maintained substitutions for chapter titles that the generated text
# would not match as-is; the replacement is what gets searched for instead.
_TITLE_OVERRIDES = {
    '第一百章': '一百零章',
    '第一千零一十一章': '第一千零十一章',
}

_MIN_CHAPTER = 1
_MAX_CHAPTER = 1623


def change(num):
    """Convert a non-negative decimal number to Chinese numerals.

    Args:
        num: The number as a digit string (an int is accepted too).
            Values from 0 to 99999 are supported.

    Returns:
        The Chinese numeral text, e.g. "1001" -> "一千零一".  An empty
        string is returned for empty input.

    Raises:
        ValueError: If *num* is not a decimal integer or is outside
            the supported range.
    """
    digit_names = {"1": "一", "2": "二", "3": "三", "4": "四", "5": "五",
                   "6": "六", "7": "七", "8": "八", "9": "九"}
    unit_names = {1: "", 2: "十", 3: "百", 4: "千", 5: "万"}

    text = str(num).strip()
    if not text:
        return ""
    value = int(text)  # raises ValueError for non-numeric input
    if value < 0 or value > 99999:
        raise ValueError("change() supports 0..99999, got %r" % (num,))
    if value == 0:
        return "零"

    text = str(value)  # normalised: no sign, no leading zeros
    parts = []
    zero_pending = False
    for position, digit in zip(range(len(text), 0, -1), text):
        if digit == "0":
            # Remember the gap; it is only written if a non-zero digit
            # follows, so trailing zeros vanish and runs collapse to one 零.
            zero_pending = True
            continue
        if zero_pending:
            parts.append("零")
            zero_pending = False
        # 2 reads as 两 before 百/千/万 but as 二 in the tens and units place.
        if digit == "2" and position >= 3:
            parts.append("两")
        else:
            parts.append(digit_names[digit])
        parts.append(unit_names[position])

    result = "".join(parts)
    if 10 <= value <= 19:
        # 10-19 are written 十, 十一, ... without the leading 一.
        result = result[1:]
    return result


def Word_cloud():
    """Build a word cloud image from the saved text file.

    Reads the text that save() wrote, shapes the cloud with the mask image
    '006.png' and writes the result to '004dou.png'.
    """
    with Image.open('006.png') as mask_image:
        mask = np.array(mask_image)
    with open(_TEXT_FILE, 'r', encoding='utf-8') as fp:
        text = fp.read()
    cloud = WordCloud(font_path='方正粗黑宋简体.ttf', width=650, height=700,
                      contour_color='yellow', contour_width=2, mask=mask,
                      max_words=500)
    cloud.generate(text)
    cloud.to_file('004dou.png')


def save(n, soup):
    """Save or print the text of every ``th``/``tr`` element in *soup*.

    Args:
        n: '1' appends the text to the output file, '2' prints it; any
            other value only reports that nothing was done.
        soup: Parsed page supporting ``select()``.
    """
    cells = soup.select('th,tr')
    if n == '1':
        print('执行指令1')
        # Append mode is kept from the original script: repeated saves add
        # the text again rather than replacing the file.
        with open(_TEXT_FILE, 'a', encoding='utf-8') as fp:
            fp.writelines(cell.get_text().strip() for cell in cells)
    elif n == '2':
        print('执行指令2')
        for cell in cells:
            print(cell.get_text().strip())
    else:
        print('出现问题', n)
        print('没有执行')


def _query_chapter(content):
    """Ask for a chapter number and print the first matching link text.

    Args:
        content: Raw HTML of the index page.
    """
    link_texts = etree.HTML(content).xpath('//body//a/text()')
    num = input("\n请输入数字(1-1623):")
    try:
        chapter_no = int(num)
    except ValueError:
        # Non-numeric input is reported the same way as an out-of-range number.
        chapter_no = _MIN_CHAPTER - 1
    if chapter_no < _MIN_CHAPTER or chapter_no > _MAX_CHAPTER:
        print("输入范围错误!\n")
        return

    title = '第' + change(num) + '章'
    title = _TITLE_OVERRIDES.get(title, title)
    print('n', title)
    for link_text in link_texts:
        if title in link_text:
            print(link_text)
            break
    else:
        print('未找到章节', title)


def main():
    """Download the index page and run the interactive command loop."""
    request = urllib.request.Request(url=url, headers=headers)
    with urllib.request.urlopen(request) as response:
        content = response.read().decode('utf-8')
    soup = BeautifulSoup(content, 'lxml')

    print(_STARTUP_MENU)
    while True:
        print('请输入指令:')
        n = input()
        if n in ('1', '2'):
            save(n, soup)
        elif n == '3':
            # The word cloud is generated from the saved file, so save first.
            print('指令3执行')
            save('1', soup)
            Word_cloud()
        elif n == '4':
            # Chapter lookup works on the raw HTML rather than the soup.
            print('指令4执行')
            _query_chapter(content)
        elif n == '5':
            print(_HELP_MENU)
        elif n == '6':
            print('退出程序')
            break
        else:
            print('!!输入了错误指令!!')
            print('==================')
            print(_HELP_MENU)
            print('==================')


if __name__ == "__main__":
    main()