@@ -1,13 +1,55 @@
import re
import requests
import random
from util import Crawler
from lxml import etree
from datetime import datetime

'''This is the straightforward approach for data source 1: the program crawls the data, computes the word frequencies, and prints the results, step by step.'''

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
class CrawlerDataSource1(Crawler):
    """Class containing the functions for the straightforward crawler for data source 1."""

    def __init__(self) -> None:
        super().__init__()
        self.driver = self.setup_driver()
        self.url = random.choice(self.data_source_1)

    def setup_driver(self):
        """Set up the Edge WebDriver."""
        option = webdriver.EdgeOptions()
        # Add the experimental option "detach" so the browser window
        # stays open after the script releases the driver.
        option.add_experimental_option("detach", True)
        driver = webdriver.Edge(options=option)
        return driver

    def crawler_1(self) -> str:
        """Crawl data source 1 in a single thread."""
        driver = self.driver

        try:
            driver.get(self.url)
            # Wait up to 10 seconds for the target element to load.
            element = WebDriverWait(driver, 10).until(
                lambda x: driver.find_element(
                    by=By.XPATH,
                    value='//*[@id="app"]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]'))

            text_content = element.text

            return text_content
        except Exception as err:
            print(f"Error encountered: {err}")
            return ""
        finally:
            driver.quit()
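
# Editorial note: the lambda-based wait above works, but Selenium's
# expected_conditions helpers are the more idiomatic way to express it.
# A minimal sketch under the same XPath and timeout (not the author's code):
#
# from selenium.webdriver.support import expected_conditions as EC
#
# element = WebDriverWait(driver, 10).until(
#     EC.presence_of_element_located(
#         (By.XPATH, '//*[@id="app"]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]')))
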
class CrawlerDataSource2(Crawler):
-    """Class containing the functions for the data source 2 crawler."""
+    """Class containing the functions for the straightforward crawler for data source 2."""

    def __init__(self) -> None:
        super().__init__()
@@ -34,13 +76,8 @@ class CrawlerDataSource2(Crawler):
        return content
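
# Editorial note: the diff elides the rest of CrawlerDataSource2; only the
# tail of a method (return content above) is visible. Judging from the
# imports (requests, lxml.etree), get_data_2 presumably fetches pages over
# HTTP and extracts text. A hypothetical sketch, with a placeholder
# attribute and XPath (not the author's values):
#
# def get_data_2(self) -> str:
#     response = requests.get(random.choice(self.data_source_2))
#     html = etree.HTML(response.text)
#     return ''.join(html.xpath('//div[@class="content"]//text()'))
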

'''
This is the straightforward approach for data source 3: the program crawls the data, computes the word frequencies, and prints the results, step by step.
'''

class CrawlerDataSource3(Crawler):
-    """Class containing the functions for the data source 3 crawler."""
+    """Class containing the functions for the straightforward crawler for data source 3."""

    def __init__(self) -> None:
        super().__init__()
@@ -64,7 +101,7 @@ class CrawlerDataSource3(Crawler):
            content = content + self.crawler_3(url)
        return content

-    def get_freqs_of_En(self, content: str) -> list[tuple[any, str]]:
+    def get_freqs_of_En(self, content: str) -> list[tuple[str, int]]:
        """Compute the word frequencies of an English text."""
        # Extract the words.
        pattern = re.compile(r'[\W_]+')
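
# Editorial note: the diff elides the body of get_freqs_of_En between the
# compiled pattern and the final return. A minimal sketch of one way to fill
# it, assuming pattern splits the text into words and no stop-word filtering
# is applied (not the author's confirmed code):
#
# from collections import Counter
#
# words = [w.lower() for w in pattern.split(content) if w]
# word_freqs = Counter(words).most_common()
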
@@ -88,19 +125,48 @@
        return word_freqs


-if __name__ == '__main__':
-    t0 = datetime.now()
def work1() -> None:
    """Crawl data source 1 the simple way."""
    cds1 = CrawlerDataSource1()
    content = cds1.crawler_1()
    word_freqs = cds1.get_freqs(content)
    cds1.print_freqs(word_freqs)


def work2() -> None:
    """Crawl data source 2 the simple way."""
    cds2 = CrawlerDataSource2()
    content = cds2.get_data_2()
    word_freqs = cds2.get_freqs(content)
    cds2.print_freqs(word_freqs)
-    t1 = datetime.now()
-    print(f"Data source 2 took: {t1-t0}")

-    t0 = datetime.now()


def work3() -> None:
    """Crawl data source 3 the simple way."""
    cds3 = CrawlerDataSource3()
    content = cds3.get_data_3()
    word_freqs = cds3.get_freqs_of_En(content)
    cds3.print_freqs(word_freqs)


if __name__ == '__main__':
    print("Starting to crawl data source 1 ...")
    t0 = datetime.now()
    work1()
    t1 = datetime.now()
    print(f"Data source 1 took: {t1 - t0}")
    print("Finished crawling data source 1.\n")

    print("Starting to crawl data source 2 ...")
    t0 = datetime.now()
    work2()
    t1 = datetime.now()
    print(f"Data source 2 took: {t1 - t0}")
    print("Finished crawling data source 2.\n")

    print("Starting to crawl data source 3 ...")
    t0 = datetime.now()
    work3()
    t1 = datetime.now()
    print(f"Data source 3 took: {t1 - t0}")
    print("Finished crawling data source 3.\n")
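
# Editorial note: the three timing blocks above repeat the same pattern.
# A minimal sketch of a helper that would factor it out (a suggestion,
# not part of the original script):
#
# def timed(label: str, fn) -> None:
#     print(f"Starting to crawl {label} ...")
#     t0 = datetime.now()
#     fn()
#     print(f"{label} took: {datetime.now() - t0}")
#     print(f"Finished crawling {label}.\n")
#
# timed("data source 1", work1)
# timed("data source 2", work2)
# timed("data source 3", work3)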