parent
21f2d811d0
commit
23c04a3584
@ -1,92 +0,0 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from pandas import DataFrame
|
||||
import xlwt
|
||||
import time
|
||||
import re
|
||||
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.55'}

# Search-result listing; ``{}`` receives the page number.
LISTING_URL = "https://www.hyluz.cn/search.php?q=单选题:&page={}"
# Destination spreadsheet (legacy .xls, written by xlwt).
OUTPUT_PATH = r'D:\爬虫爬题3.xls'
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# Compiled once at import time instead of on every loop iteration.
LINK_PATTERN = re.compile(
    r'<h2><a href="(.*?)" target="_blank"><strong><mark>单选题:</mark></strong>',
    re.DOTALL,
)
TITLE_PATTERN = re.compile(r'<h1>(.*?)</h1>', re.DOTALL)
BODY_PATTERN = re.compile(r'<br>(.*?)</div>', re.DOTALL)
BREAK_RUN_PATTERN = re.compile(r'(?:<br>)+')


def extract_question_links(listing_html):
    """Return the question-page URLs found in one search-result page."""
    return LINK_PATTERN.findall(listing_html)


def collapse_breaks(fragment):
    """Replace every run of consecutive ``<br>`` tags with a single ``;\\t``."""
    return BREAK_RUN_PATTERN.sub(";\t", fragment)


def parse_question_page(page_html):
    """Extract ``(title, options_and_answer)`` from a question page.

    The title is the concatenation of every ``<h1>`` match (normally one).
    The second element is the first ``<br>...</div>`` span with its ``<br>``
    runs collapsed by :func:`collapse_breaks`.

    Returns ``None`` when the page has no ``<br>...</div>`` span, so that the
    caller can skip the page instead of failing with ``IndexError``.
    """
    bodies = BODY_PATTERN.findall(page_html)
    if not bodies:
        return None
    title = "".join(TITLE_PATTERN.findall(page_html))
    return title, collapse_breaks(bodies[0])


def fetch_html(url):
    """Download *url* and return its body decoded as UTF-8.

    Raises ``requests.RequestException`` on timeouts, connection failures
    and non-2xx responses.
    """
    response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Decode explicitly: ``response.text`` relies on the encoding guessed from
    # the HTTP headers, which is not guaranteed to be UTF-8.
    return response.content.decode('utf-8', errors='replace')


def main(first_page=100, last_page=250):
    """Crawl listing pages ``first_page`` .. ``last_page - 1`` into a workbook.

    Every question found is written as one row: column 0 holds the title,
    column 1 the options and answer. The workbook is saved even when the
    crawl is interrupted, so rows collected so far are not lost.
    """
    # Create the workbook, i.e. a new Excel file.
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=1)
    # Create the worksheet.
    worksheet = workbook.add_sheet('题', cell_overwrite_ok=True)
    worksheet.write(0, 0, "题目")
    worksheet.write(0, 1, "选项+答案")
    worksheet.col(0).width = 256 * 100
    worksheet.col(1).width = 256 * 100

    row = 1  # row 0 is the header
    try:
        for page in range(first_page, last_page):
            try:
                listing_html = fetch_html(LISTING_URL.format(page))
            except requests.RequestException as exc:
                print("skipping listing page {}: {}".format(page, exc))
                continue

            for link in extract_question_links(listing_html):
                try:
                    parsed = parse_question_page(fetch_html(link))
                except requests.RequestException as exc:
                    print("skipping {}: {}".format(link, exc))
                    continue
                if parsed is None:
                    print("no answer block found in {}".format(link))
                    continue

                title, options = parsed
                print(title)
                # Write the data.
                worksheet.write(row, 0, title)
                worksheet.write(row, 1, options)
                row += 1

            # NOTE(review): the original indentation was lost; the pause is
            # assumed to apply once per listing page - confirm.
            time.sleep(1)
    finally:
        workbook.save(OUTPUT_PATH)


if __name__ == "__main__":
    main()
|
||||
|
||||
# for i in name_list:
|
||||
# url = "https://so.gushiwen.cn/mingjus/default.aspx?astr={}".format(i)
|
||||
# response = requests.get(url=url, headers=headers)
|
||||
# html = response.content.decode('utf-8')
|
||||
# text = response.text
|
||||
# dynasties = re.findall(r'<a style=" float:left;" target="_blank" href="/.*?>(.*?)</a>', text, re.DOTALL)
|
||||
# # titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
|
||||
# for i in range(0, len(dynasties), 2):
|
||||
# print(dynasties[i], dynasties[i + 1])
|
||||
# # 写入数据
|
||||
# worksheet.write(k, 0, dynasties[i])
|
||||
# worksheet.write(k, 1, dynasties[i+1])
|
||||
# k+=1
|
||||
|
||||
Loading…
Reference in new issue