Update 000 普通做法.py

dev
pcz4qfnkl 11 months ago
parent b833cc4236
commit a885e6b63e

@ -1,13 +1,55 @@
import re
import requests
import random
from util import Crawler
from lxml import etree
from datetime import datetime
'''这是对于数据源2的普通做法按照爬取数据获取词频打印结果的步骤依次运行程序。'''
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
class CrawlerDataSource1(Crawler):
    """Plain (single-threaded) crawler for data source 1.

    Drives a Microsoft Edge browser via Selenium to fetch one randomly
    chosen URL from the base class's ``data_source_1`` pool.
    """

    def __init__(self) -> None:
        super().__init__()
        # Browser is created once per crawler instance.
        self.driver = self.setup_driver()
        # Pick one URL at random from the pool provided by the Crawler base.
        self.url = random.choice(self.data_source_1)

    def setup_driver(self):
        """Create and return a configured Edge WebDriver.

        Returns:
            The ready-to-use ``webdriver.Edge`` instance.
        """
        option = webdriver.EdgeOptions()
        # "detach" keeps the browser window open after the script exits.
        option.add_experimental_option("detach", True)
        return webdriver.Edge(options=option)

    def crawler_2(self) -> str:
        """Fetch the target element's text with a single thread.

        Returns:
            The text content of the target node, or "" on any error.
        """
        driver = self.driver
        try:
            driver.get(self.url)
            # Wait up to 10 s for the target node to appear.
            # Fix: the wait callback now uses the driver argument that
            # WebDriverWait passes to it, instead of closing over the
            # outer variable and ignoring the parameter.
            element = WebDriverWait(driver, 10).until(
                lambda d: d.find_element(
                    by=By.XPATH,
                    value='//*[@id="app"]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]',
                )
            )
            return element.text
        except Exception as result:
            # Best-effort: report the failure and return an empty string
            # rather than aborting the whole run.
            print(f"发现错误:{result}")
            return ""
        finally:
            # Always release the browser, even on failure.
            driver.quit()
class CrawlerDataSource2(Crawler):
"""包含数据2爬虫相应函数的类"""
"""包含数据2爬虫普通做法相应函数的类"""
def __init__(self) -> None:
super().__init__()
@ -34,13 +76,8 @@ class CrawlerDataSource2(Crawler):
return content
'''
这是对于数据源3的普通做法按照爬取数据获取词频打印结果的步骤依次运行程序
'''
class CrawlerDataSource3(Crawler):
"""包含数据3爬虫相应函数的类"""
"""包含数据3爬虫普通做法相应函数的类"""
def __init__(self) -> None:
super().__init__()
@ -64,7 +101,7 @@ class CrawlerDataSource3(Crawler):
content = content + self.crawler_3(url)
return content
def get_freqs_of_En(self, content: str) -> list[tuple[any, str]]:
def get_freqs_of_En(self, content: str) -> list[tuple[str, int]]:
"""获取英文文本的词频"""
# 获取单词
pattern = re.compile('[\W_]+')
@ -88,19 +125,48 @@ class CrawlerDataSource3(Crawler):
return word_freqs
if __name__ == '__main__':
t0 = datetime.now()
def work1() -> None:
    """Crawl data source 1 the plain way: fetch text, count words, print."""
    crawler = CrawlerDataSource1()
    raw_text = crawler.crawler_2()
    frequencies = crawler.get_freqs(raw_text)
    crawler.print_freqs(frequencies)
def work2() -> None:
    """Crawl data source 2 the plain way: fetch text, count words, print."""
    crawler = CrawlerDataSource2()
    raw_text = crawler.get_data_2()
    frequencies = crawler.get_freqs(raw_text)
    crawler.print_freqs(frequencies)
t1 = datetime.now()
print(f"数据2耗时:{t1-t0}")
t0 = datetime.now()
def work3() -> None:
    """Crawl data source 3 the plain way: fetch English text, count, print."""
    crawler = CrawlerDataSource3()
    raw_text = crawler.get_data_3()
    frequencies = crawler.get_freqs_of_En(raw_text)
    crawler.print_freqs(frequencies)
if __name__ == '__main__':
    # Run the three plain crawlers in sequence, timing each one.
    # Bug fix: the start banner previously printed "开始爬取数据1……" for
    # all three runs; it now names the data source actually being crawled.
    for label, work in (("1", work1), ("2", work2), ("3", work3)):
        print(f"开始爬取数据{label}……")
        t0 = datetime.now()
        work()
        t1 = datetime.now()
        print(f"数据{label}耗时:{t1-t0}")
        print(f"数据{label}爬取结束。\n")

Loading…
Cancel
Save