From a885e6b63e70d3be6a03f9938002a286fcab630f Mon Sep 17 00:00:00 2001
From: pcz4qfnkl <1928814540@qq.com>
Date: Fri, 22 Mar 2024 18:01:51 +0800
Subject: [PATCH] =?UTF-8?q?Update=20000=20=E6=99=AE=E9=80=9A=E5=81=9A?=
 =?UTF-8?q?=E6=B3=95.py?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 20 高性能模式/000 普通做法.py | 94 ++++++++++++++++++++++----
 1 file changed, 80 insertions(+), 14 deletions(-)

diff --git a/20 高性能模式/000 普通做法.py b/20 高性能模式/000 普通做法.py
index 91d1d80..e00e08f 100644
--- a/20 高性能模式/000 普通做法.py
+++ b/20 高性能模式/000 普通做法.py
@@ -1,13 +1,55 @@
 import re
 import requests
+import random
 from util import Crawler
 from lxml import etree
 from datetime import datetime
 
-'''This is the plain approach for data source 2: crawl the data, get the word frequencies, and print the results, running the steps in order.'''
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+
+
+class CrawlerDataSource1(Crawler):
+    """Class with the plain-approach crawler functions for data source 1."""
+    def __init__(self) -> None:
+        super().__init__()
+        self.driver = self.setup_driver()
+        self.url = random.choice(self.data_source_1)
+
+    def setup_driver(self):
+        """Set up the Edge WebDriver."""
+        option = webdriver.EdgeOptions()
+        # Add the experimental "detach" option and set it to True
+        option.add_experimental_option("detach", True)
+        driver = webdriver.Edge(options=option)
+        return driver
+
+    def crawler_2(self) -> str:
+        """Crawl data source 1 in a single thread."""
+        driver = self.driver
+
+        try:
+            driver.get(self.url)
+            # Wait for the page to load
+            element = WebDriverWait(driver, 10).until(
+                lambda d: d.find_element(
+                    by=By.XPATH,
+                    value='//*[@id="app"]/div[2]/div[2]/div[2]'
+                          '/div[1]/div[1]/div[2]',
+                ))
+
+            text_content = element.text
+
+            return text_content
+        except Exception as result:
+            print(f"Error encountered: {result}")
+            return ""
+        finally:
+            driver.quit()
 
 
 class CrawlerDataSource2(Crawler):
-    """Class with the crawler functions for data source 2."""
+    """Class with the plain-approach crawler functions for data source 2."""
 
     def __init__(self) -> None:
         super().__init__()
@@ -34,13 +76,8 @@ class CrawlerDataSource2(Crawler):
         return content
 
 
-'''
-    This is the plain approach for data source 3: crawl the data, get the word frequencies, and print the results, running the steps in order.
-'''
-
-
 class CrawlerDataSource3(Crawler):
-    """Class with the crawler functions for data source 3."""
+    """Class with the plain-approach crawler functions for data source 3."""
 
     def __init__(self) -> None:
         super().__init__()
@@ -64,7 +101,7 @@ class CrawlerDataSource3(Crawler):
             content = content + self.crawler_3(url)
         return content
 
-    def get_freqs_of_En(self, content: str) -> list[tuple[any, str]]:
+    def get_freqs_of_En(self, content: str) -> list[tuple[str, int]]:
         """Get the word frequencies of an English text."""
         # Extract the words
         pattern = re.compile('[\W_]+')
@@ -88,19 +125,48 @@
         return word_freqs
 
 
-if __name__ == '__main__':
-    t0 = datetime.now()
+def work1() -> None:
+    """Crawl data 1 with the simple approach."""
+    cds1 = CrawlerDataSource1()
+    content = cds1.crawler_2()
+    word_freqs = cds1.get_freqs(content)
+    cds1.print_freqs(word_freqs)
+
+
+def work2() -> None:
+    """Crawl data 2 with the simple approach."""
     cds2 = CrawlerDataSource2()
     content = cds2.get_data_2()
     word_freqs = cds2.get_freqs(content)
     cds2.print_freqs(word_freqs)
-    t1 = datetime.now()
-    print(f"Data 2 took: {t1-t0}")
 
-    t0 = datetime.now()
+
+def work3() -> None:
+    """Crawl data 3 with the simple approach."""
     cds3 = CrawlerDataSource3()
     content = cds3.get_data_3()
     word_freqs = cds3.get_freqs_of_En(content)
     cds3.print_freqs(word_freqs)
+
+
+if __name__ == '__main__':
+    print("Start crawling data 1...")
+    t0 = datetime.now()
+    work1()
+    t1 = datetime.now()
+    print(f"Data 1 took: {t1-t0}")
+    print("Finished crawling data 1.\n")
+
+    print("Start crawling data 2...")
+    t0 = datetime.now()
+    work2()
+    t1 = datetime.now()
+    print(f"Data 2 took: {t1-t0}")
+    print("Finished crawling data 2.\n")
+
+    print("Start crawling data 3...")
+    t0 = datetime.now()
+    work3()
     t1 = datetime.now()
     print(f"Data 3 took: {t1-t0}")
+    print("Finished crawling data 3.\n")
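
Two notes on the patched module, outside the diff itself.

First, CrawlerDataSource1 leans on the util.Crawler base class, which this patch does not touch. The sketch below is a guess at the interface the diff assumes: only the names data_source_1, get_freqs, and print_freqs are taken from the calls in the patch, and the URL and method bodies are illustrative placeholders, not the project's actual implementation.

    # Hypothetical sketch of the util.Crawler interface assumed above.
    import collections
    import string


    class Crawler:
        def __init__(self) -> None:
            # Candidate URLs for data source 1 (placeholder value).
            self.data_source_1: list[str] = ["https://example.com/source1"]

        def get_freqs(self, content: str) -> list[tuple[str, int]]:
            """Count word frequencies, most frequent first."""
            words = content.lower().translate(
                str.maketrans("", "", string.punctuation)).split()
            return collections.Counter(words).most_common()

        def print_freqs(self, word_freqs: list[tuple[str, int]]) -> None:
            """Print (word, count) pairs, one per line."""
            for word, count in word_freqs:
                print(word, count)

Second, the explicit wait in crawler_2 drives WebDriverWait with a lambda; Selenium's bundled expected_conditions module expresses the same wait (same XPath, same 10-second timeout) in the more conventional form:

    from selenium.webdriver.support import expected_conditions as EC

    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(
            (By.XPATH,
             '//*[@id="app"]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]')))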