import requests
import pandas as pd
from bs4 import BeautifulSoup
import os  # used to create output directories
from threading import Thread
from fake_useragent import UserAgent
from queue import Queue, Empty
import re
from time import sleep, strftime
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
import sys

def request_with_retry(url, headers, max_retries = 3, backoff_factor = 0.5, status_forcelist = None):  # retrying wrapper around the crawler's HTTP requests
    if status_forcelist is None:
        status_forcelist = [500, 502, 503, 504]
    session = requests.Session()
    retries = Retry(total = max_retries,
                    backoff_factor = backoff_factor,
                    status_forcelist = status_forcelist,
                    allowed_methods = ["GET", "POST"])  # urllib3 < 1.26 called this parameter method_whitelist
    adapter = HTTPAdapter(max_retries = retries)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    try:
        with session as http:
            response = http.get(url, headers = headers)
            response.raise_for_status()  # raise HTTPError if the response code is 4XX or 5XX
            return response
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e}")
        sys.exit(1)
    except requests.exceptions.ConnectionError as e:  # client could not reach the server
        print(f"Connection Error: {e}")
        sys.exit(1)
    except requests.exceptions.Timeout as e:
        print(f"Timeout Error: {e}")
        sys.exit(1)
    except requests.exceptions.RequestException as e:  # any other request failure with no specific cause
        print(f"Request Exception: {e}")
        sys.exit(1)
# print(strftime('%Y-%m-%d %H:%M:%S'))
url_queue_rank = None  # shared queue of ranking-page URLs; created in Queue_Spider_start

def mkfile_ranklist(file_name):  # create the output directory if it does not exist
    if not os.path.exists(file_name):
        os.makedirs(file_name)

def spider_top():
    global url_queue_rank
    type_list = []
    rate_list = []
    name_list = []
    id_list = []
    while not url_queue_rank.empty():  # empty() can race with other worker threads, so guard the get() below
        try:
            url = url_queue_rank.get(timeout = 1)
        except Empty:  # another thread drained the queue first
            break
        headers = {
            'User-Agent': UserAgent().chrome
        }
        try:
            req1 = request_with_retry(url, headers = headers)
            req = BeautifulSoup(req1.text, 'html.parser')
            for num in range(2, 41):
                type_list.append(
                    req.select(f'#articlelist > ul:nth-child(3) > li:nth-child({num}) > span.l1')[0].text)  # novel genre
                name_pre = req.select(f'#articlelist > ul:nth-child(3) > li:nth-child({num}) > span.l2')[0].text  # novel title
                name = re.sub(r'[\\/:*?"<>|]', '', name_pre)  # strip characters that are illegal in file names
                name_list.append(name)
                Denominatorreq = int(
                    req.select(f'#articlelist > ul:nth-child(3) > li:nth-child({num}) > span.l5')[0].text[:-1])
                Numerator = int(req.select(f'#articlelist > ul:nth-child(3) > li:nth-child({num}) > span.l6')[0].text)
                Quotient = Numerator * 100 / Denominatorreq  # recommendation score
                rate_list.append(Quotient)
                id_list.append((
                    req.select(f'#articlelist > ul:nth-child(3) > li:nth-child({num}) > span.l2 > a')[0].get(
                        'href').split('/')[-2])[5:])  # novel id
        except Exception as e:
            print(e)
    dataframe = pd.DataFrame({'小说类型': type_list, '小说书名': name_list, '推荐值': rate_list, '小说id': id_list})
    # save the DataFrame as csv; index controls whether the row index is written (default True)
    dataframe.to_csv("end\\排名\\test.csv", mode = 'a', index = False)
    # output directory: end\排名
    # print(name_list,rate_list,type_list)

def data_sort():
    df = pd.read_csv('end\\排名\\test.csv')
    # read the csv written by the crawler threads
    df.dropna(axis = 0, how = 'any', subset = ['小说书名', '小说类型', '推荐值', '小说id'], inplace = True)
    # drop rows with missing values
    df.drop_duplicates(subset = ['小说书名'], inplace = True)
    # de-duplicate on the title column, keeping only the first occurrence; inplace=True modifies the DataFrame directly
    df.drop(df[df['小说类型'] == '小说类型'].index, inplace = True)
    # drop repeated header rows introduced by appending to the csv (rows whose type column equals the header text itself)
    df.to_csv('F:\\scrapyer\\end\\排名\\test.csv', index = False)

def Queue_Spider_start(num):  # the URL queue is kept global so the worker threads share it without it being passed as an argument
    global url_queue_rank
    url_queue_rank = Queue()
    print(f'Crawling ranking pages 1-{num}')
    for page in range(1, num + 1):  # number of pages to crawl
        url = f'https://www.qb5.ch/top/allvisit/{str(page)}.html'
        url_queue_rank.put(url)
    threads = []
    for i in range(4):  # number of worker threads
        print(f'Thread {i} started')
        t1 = Thread(target = spider_top)
        t1.start()
        threads.append(t1)
    for thread in threads:  # important: wait for every worker, otherwise the later dedup/rewrite step runs on incomplete data
        thread.join()
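
# Example driver (hypothetical; the original file does not show how these functions are invoked,
# and the directory name and page count below are illustrative assumptions):
# if __name__ == '__main__':
#     mkfile_ranklist('end\\排名')   # make sure the output directory exists
#     Queue_Spider_start(5)          # crawl ranking pages 1-5 with 4 worker threads
#     data_sort()                    # clean up the appended csv afterwards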

'''# Use the Retry class to define the retry policy: total attempts, status codes, exception types, etc.
retries = Retry(
    total=5,  # total number of retries
    backoff_factor=1,  # exponential backoff factor: the base value the library uses to compute the sleep interval between retries
    # sleep = {backoff factor} * (2 ** ({number of previous retries})); with factor 1 -> 1, 2, 4, 8, 16
    status_forcelist=[500, 502, 503, 504],  # status codes that trigger a retry
    allowed_methods=frozenset(['GET', 'POST']),  # HTTP methods that may be retried
)
# Create an HTTPAdapter configured with the retry policy
adapter = HTTPAdapter(max_retries=retries)
# Mount the adapter onto a Session
# Create a Session object and mount the adapter created above for both HTTP and HTTPS requests.
session = requests.Session()
# Registers a connection adapter to a prefix.
session.mount('http://', adapter)
session.mount('https://', adapter)
'''
'''
session.mount("http://", adapter) is a method of the requests library that registers an adapter for a given URL prefix. Here
session is a requests.Session object and adapter is an adapter object, usually an instance of requests.adapters.HTTPAdapter.
Mounting sets a default adapter for every request the session sends to URLs matching that prefix, so connection pooling and
retries are handled automatically. In this example the adapter is mounted for the HTTP prefix, which means all HTTP requests
made through the session use this adapter for connection handling and retries.
'''
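
'''
A minimal, self-contained sketch of the mounting behaviour described above (illustrative only, not part of the crawler):
requests resolves the adapter by the longest matching URL prefix, so a more specific mount takes precedence over the
generic 'https://' one. The pool sizes below are arbitrary example values.

import requests
from requests.adapters import HTTPAdapter

s = requests.Session()
default_adapter = HTTPAdapter(pool_connections = 10)
site_adapter = HTTPAdapter(pool_connections = 50)
s.mount('https://', default_adapter)
s.mount('https://www.qb5.ch/', site_adapter)  # longer prefix wins for this host
assert s.get_adapter('https://www.qb5.ch/top/') is site_adapter
assert s.get_adapter('https://example.com/') is default_adapter
'''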