diff --git a/spider_ranklist.py b/spider_ranklist.py
new file mode 100644
index 0000000..c6270a6
--- /dev/null
+++ b/spider_ranklist.py
@@ -0,0 +1,149 @@
+import requests
+import pandas as pd
+from bs4 import BeautifulSoup
+import os  # for creating the output directory
+from threading import Thread
+from fake_useragent import UserAgent
+from queue import Queue, Empty
+import re
+import sys
+from requests.adapters import HTTPAdapter
+from urllib3.util import Retry
+
+
+def request_with_retry(url, headers, max_retries=3, backoff_factor=0.5, status_forcelist=None):
+    """GET a URL with automatic retries on transient server errors."""
+    if status_forcelist is None:
+        status_forcelist = [500, 502, 503, 504]
+
+    session = requests.Session()
+    retries = Retry(total=max_retries,
+                    backoff_factor=backoff_factor,
+                    status_forcelist=status_forcelist,
+                    allowed_methods=["GET", "POST"])  # method_whitelist was removed in urllib3 2.x
+    adapter = HTTPAdapter(max_retries=retries)
+    session.mount('http://', adapter)
+    session.mount('https://', adapter)
+    try:
+        with session as http:
+            # without an explicit timeout the Timeout handler below could never fire
+            response = http.get(url, headers=headers, timeout=10)
+            response.raise_for_status()  # raises HTTPError on 4XX/5XX status codes
+            return response
+    except requests.exceptions.HTTPError as e:
+        print(f"HTTP Error: {e}")
+        sys.exit(1)
+    except requests.exceptions.ConnectionError as e:  # client could not reach the server
+        print(f"Connection Error: {e}")
+        sys.exit(1)
+    except requests.exceptions.Timeout as e:
+        print(f"Timeout Error: {e}")
+        sys.exit(1)
+    except requests.exceptions.RequestException as e:  # any other request failure
+        print(f"Request Exception: {e}")
+        sys.exit(1)
+
+
+url_queue_rank = None  # shared work queue, initialised in Queue_Spider_start()
+
+
+def mkfile_ranklist(file_name):  # create the output directory if it does not exist
+    if not os.path.exists(file_name):
+        os.makedirs(file_name)
+
+
+def spider_top():
+    global url_queue_rank
+    type_list = []
+    rate_list = []
+    name_list = []
+    id_list = []
+    while True:
+        # get() with a timeout avoids the race between empty() and get() when
+        # several threads drain the queue at once
+        try:
+            url = url_queue_rank.get(timeout=1)
+        except Empty:
+            break
+        headers = {
+            'User-Agent': UserAgent().chrome
+        }
+        try:
+            response = request_with_retry(url, headers=headers)
+            soup = BeautifulSoup(response.text, 'html.parser')
+            for num in range(2, 41):  # rows 2-40; row 1 is the table header
+                type_list.append(
+                    soup.select(f'#articlelist > ul:nth-child(3) > li:nth-child({num}) > span.l1')[0].text)  # genre
+                name_pre = soup.select(f'#articlelist > ul:nth-child(3) > li:nth-child({num}) > span.l2')[0].text  # title
+                name = re.sub(r'[\\/:*?"<>|]', '', name_pre)  # strip characters that are illegal in file names
+                name_list.append(name)
+                denominator = int(
+                    soup.select(f'#articlelist > ul:nth-child(3) > li:nth-child({num}) > span.l5')[0].text[:-1])
+                numerator = int(soup.select(f'#articlelist > ul:nth-child(3) > li:nth-child({num}) > span.l6')[0].text)
+                quotient = numerator * 100 / denominator  # recommendation score
+                rate_list.append(quotient)
+                id_list.append((
+                    soup.select(f'#articlelist > ul:nth-child(3) > li:nth-child({num}) > span.l2 > a')[0].get(
+                        'href').split('/')[-2])[5:])  # novel id
+        except Exception as e:
+            print(e)
+    dataframe = pd.DataFrame({'小说类型': type_list, '小说书名': name_list, '推荐值': rate_list, '小说id': id_list})
+    # append to the shared CSV; index=False suppresses the row index.  Every
+    # thread appends its own header row here - data_sort() removes them later.
+    dataframe.to_csv("end\\排名\\test.csv", mode='a', index=False)
+
+
+def data_sort():
+    df = pd.read_csv('end\\排名\\test.csv')
+    # drop rows with a missing value in any of the four columns
+    df.dropna(axis=0, how='any', subset=['小说书名', '小说类型', '推荐值', '小说id'], inplace=True)
+    # drop rows whose title has already appeared; inplace=True modifies df directly
+    df.drop_duplicates(subset=['小说书名'], inplace=True)
+    # drop the duplicate header rows written by the mode='a' appends: in those
+    # rows the 小说类型 column holds the literal header text '小说类型'
+    df.drop(df[df['小说类型'] == '小说类型'].index, inplace=True)
+    df.to_csv('end\\排名\\test.csv', index=False)  # write back to the same file that was read
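+
+
+# A doctest-style sketch of why the drop above works (illustrative values only,
+# not scraped data): because spider_top() appends with mode='a', each thread
+# writes its own header row into test.csv, and those rows come back as data
+# when the file is re-read.
+# >>> demo = pd.DataFrame({'小说类型': ['玄幻', '小说类型', '都市'],
+# ...                      '小说书名': ['BookA', '小说书名', 'BookB']})
+# >>> demo.drop(demo[demo['小说类型'] == '小说类型'].index, inplace=True)
+# >>> list(demo['小说类型'])
+# ['玄幻', '都市']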
+
+
+def Queue_Spider_start(num):
+    # the queue stays module-level so the spider_top workers can share it
+    # without it being passed as a thread argument
+    global url_queue_rank
+    url_queue_rank = Queue()
+    print(f'Crawling ranking pages 1-{num}')
+    for page in range(1, num + 1):  # how many pages to crawl
+        url = f'https://www.qb5.ch/top/allvisit/{page}.html'
+        url_queue_rank.put(url)
+    threads = []
+    for i in range(4):  # number of worker threads
+        print(f'Thread {i} started')
+        t1 = Thread(target=spider_top)
+        t1.start()
+        threads.append(t1)
+    for thread in threads:  # important: wait for every worker, otherwise
+        thread.join()       # data_sort() would run on a half-written CSV
+
+
+'''Use the Retry class to define the retry policy: total retries, the status
+codes that trigger a retry, the HTTP methods allowed to retry, and so on.
+retries = Retry(
+    total=5,  # total number of retries
+    backoff_factor=1,  # exponential backoff factor: the sleep between retries
+    # is {backoff factor} * (2 ** {number of previous retries}), so a factor
+    # of 1 gives sleeps of 1, 2, 4, 8, 16 seconds
+    status_forcelist=[500, 502, 503, 504],  # status codes that trigger a retry
+    allowed_methods=frozenset(['GET', 'POST']),  # HTTP methods allowed to retry
+)
+
+# create an HTTPAdapter configured with the retry policy
+adapter = HTTPAdapter(max_retries=retries)
+
+# create a Session and mount the adapter for both HTTP and HTTPS;
+# mount() registers a connection adapter for a URL prefix
+session = requests.Session()
+session.mount('http://', adapter)
+session.mount('https://', adapter)
+'''
+'''
+session.mount("http://", adapter) is a requests method that registers a
+connection adapter for a protocol prefix.  Here session is a requests.Session
+object and adapter is usually an instance of requests.adapters.HTTPAdapter.
+
+Mounting sets a default adapter for every matching request made through the
+session, so connection pooling and retries are handled automatically.  In this
+example the adapter is mounted for the "http://" prefix, meaning every HTTP
+request made through session uses it for connection handling and retries.
+'''
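+
+
+# A minimal driver sketch wiring the pieces together in the intended order
+# (the page count of 5 is illustrative, and the directory must match the
+# paths hard-coded in spider_top/data_sort):
+if __name__ == '__main__':
+    mkfile_ranklist('end\\排名')  # the to_csv calls fail if the directory is missing
+    Queue_Spider_start(5)         # crawl ranking pages 1-5 with four worker threads
+    data_sort()                   # then de-duplicate and clean the merged CSV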