"""Crawl single-choice questions from www.hyluz.cn into an .xls workbook.

Recovered from the patch that added ``爬虫爬题.py`` (commit 524a5e1e).

For every search-result page in ``range(100, 250)`` the script fetches each
matched question link, extracts the question text and the option/answer text
with regular expressions, and writes one spreadsheet row per question
(column 0: question, column 1: options + answer).

NOTE(review): in the recovered source the pattern literals and the
``str.replace`` arguments contain only line breaks where HTML markup would be
expected.  Presumably the tags (and, for the link pattern, the capture group
holding the URL) were lost when the patch text was extracted -- TODO restore
them from the original file before running.  They are reproduced below exactly
as they appear, gathered in one place so they are easy to correct.
"""

import re
import time

import requests
import xlwt
from bs4 import BeautifulSoup
from pandas import DataFrame

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.55'}

SEARCH_URL = "https://www.hyluz.cn/search.php?q=单选题:&page={}"
OUTPUT_PATH = r'D:\爬虫爬题3.xls'

# Seconds to wait for a response before giving up on a request.
REQUEST_TIMEOUT = 10

# Compiled once instead of on every loop iteration.
# NOTE(review): see the module docstring -- these literals look truncated.
LINK_PATTERN = re.compile(r'\n\n单选题:', re.DOTALL)
TITLE_PATTERN = re.compile(r'\n\n(.*?)\n\n', re.DOTALL)
OPTIONS_PATTERN = re.compile(r'\n(.*?)', re.DOTALL)

# Token separating options in the scraped text, and what replaces it in the
# spreadsheet cell.
# NOTE(review): presumably an HTML line-break tag originally -- TODO confirm.
OPTION_BREAK = "\n"
OPTION_SEPARATOR = ";\t"
_BREAK_RUN = re.compile("(?:" + re.escape(OPTION_BREAK) + ")+")


def _fetch(url):
    """Return the response text for *url*, or None when the request fails.

    A failed request is reported and skipped so that one bad page does not
    abort the whole crawl.
    """
    try:
        response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print("skipping {!r}: {}".format(url, exc))
        return None
    return response.text


def _join_options(raw):
    """Replace every run of OPTION_BREAK in *raw* with one OPTION_SEPARATOR.

    Equivalent to repeatedly collapsing doubled breaks into a single break
    and then replacing each remaining break with the separator.
    """
    # A callable replacement keeps the separator literal (no escape handling).
    return _BREAK_RUN.sub(lambda _match: OPTION_SEPARATOR, raw)


def _scrape_question(url):
    """Return ``(title_matches, options_text)`` for one question page.

    Returns None when the page cannot be fetched or has no options match;
    the original code indexed the first match unconditionally and crashed
    with IndexError in that case.
    """
    page = _fetch(url)
    if page is None:
        return None
    titles = TITLE_PATTERN.findall(page)
    print(titles)
    option_matches = OPTIONS_PATTERN.findall(page)
    if not option_matches:
        print("no options found at {!r}".format(url))
        return None
    return titles, _join_options(option_matches[0])


def main():
    """Run the crawl and save the workbook, even if the crawl is interrupted."""
    # Create the workbook (a new Excel file) and its single worksheet.
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=1)
    worksheet = workbook.add_sheet('题', cell_overwrite_ok=True)
    worksheet.write(0, 0, "题目")
    worksheet.write(0, 1, "选项+答案")

    next_row = 1  # row 0 holds the column headers
    try:
        for page_number in range(100, 250):
            listing = _fetch(SEARCH_URL.format(page_number))
            if listing is not None:
                for link in LINK_PATTERN.findall(listing):
                    scraped = _scrape_question(link)
                    if scraped is None:
                        continue
                    titles, options = scraped
                    # The title matches are handed to write() as the list
                    # returned by findall, unchanged from the original.
                    worksheet.write(next_row, 0, titles)
                    worksheet.write(next_row, 1, options)
                    next_row += 1
            # NOTE(review): indentation was lost in the recovered source; the
            # pause is applied once per search page -- TODO confirm.
            time.sleep(1)
    finally:
        # Save in ``finally`` so rows collected before an error or Ctrl-C
        # are not lost.
        worksheet.col(0).width = 256*100
        worksheet.col(1).width = 256*100
        workbook.save(OUTPUT_PATH)


if __name__ == "__main__":
    main()