import re
from datetime import datetime

import requests
from lxml import etree

from util import Crawler

'''This is the plain, sequential approach for data source 2: crawl the data,
compute the word frequencies, and print the results, one step after another.'''
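# For reference, a minimal sketch of the interface this module assumes from the
# util.Crawler base class (the real definitions live in util.py; the names and
# types below are inferred from how they are used in this file):
#
#     class Crawler:
#         data_source_2: list[str]        # URLs for data source 2
#         data_source_3: list[str]        # URLs for data source 3
#         stop_words: set[str]            # words excluded from the counts
#
#         def get_headers(self) -> dict: ...       # HTTP request headers
#         def get_freqs(self, content: str): ...   # word frequencies (general text)
#         def print_freqs(self, word_freqs) -> None: ...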


class CrawlerDataSource2(Crawler):
    """Class containing the crawler functions for data source 2."""

    def __init__(self) -> None:
        super().__init__()

    def crawler_2(self, url: str):
        """Crawl one page of data source 2 in a single thread."""
        try:
            response = requests.post(url, headers=self.get_headers())
            html = response.content.decode('utf-8')
            tree = etree.HTML(html)
            # The target paragraph sits at a fixed position in the page layout.
            text = tree.xpath('/html/body/div/div/div[4]/div[1]/p/text()')[0]
            return text
        except Exception as result:
            print(f"Error encountered: {result}")
            # time.sleep(3)
            return None

    def get_data_2(self) -> str:
        """Fetch each page of data source 2 and collect the text into one string."""
        content = ''
        for url in self.data_source_2:
            text = self.crawler_2(url)
            if text:
                # Accumulate the fetched page text.
                content = content + text
        return content


'''
This is the plain, sequential approach for data source 3: crawl the data,
compute the word frequencies, and print the results, one step after another.
'''


class CrawlerDataSource3(Crawler):
    """Class containing the crawler functions for data source 3."""

    def __init__(self) -> None:
        super().__init__()

    def crawler_3(self, url: str):
        """Crawl one page of data source 3 in a single thread."""
        try:
            response = requests.get(url, headers=self.get_headers())
            text = response.content.decode('utf-8')
            return text
        except Exception as result:
            print(f"Error encountered: {result}")
            # time.sleep(3)
            return None

    def get_data_3(self) -> str:
        """Fetch each page of data source 3 and collect the text into one string."""
        content = ''
        for url in self.data_source_3:
            text = self.crawler_3(url)
            if text:
                # Accumulate the fetched page text.
                content = content + text
        return content

    def get_freqs_of_En(self, content: str) -> list[tuple[str, int]]:
        """Compute the word frequencies of an English text."""
        # Tokenize: replace non-word characters with spaces, lowercase, split.
        pattern = re.compile(r'[\W_]+')
        word_list = pattern.sub(' ', content).lower()
        word_list = word_list.split()

        # Filter out stop words and words shorter than three characters.
        word_list = [
            w for w in word_list if (w not in self.stop_words) and len(w) >= 3
        ]

        # Count the frequency of each remaining word.
        word_freqs = {}
        for word in word_list:
            word_freqs[word] = word_freqs.get(word, 0) + 1

        # Sort by frequency, most frequent first.
        word_freqs = sorted(word_freqs.items(),
                            key=lambda x: x[1],
                            reverse=True)
        return word_freqs
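    # A quick illustration of what get_freqs_of_En returns, on a made-up input:
    #
    #     CrawlerDataSource3().get_freqs_of_En("The cat saw the other cat")
    #     -> [('cat', 2), ('saw', 1), ('other', 1)]
    #
    # assuming "the" is in self.stop_words; words shorter than three
    # characters are dropped unconditionally.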


if __name__ == '__main__':
    # Data source 2: crawl, count word frequencies, print, and time the run.
    t0 = datetime.now()
    cds2 = CrawlerDataSource2()
    content = cds2.get_data_2()
    word_freqs = cds2.get_freqs(content)
    cds2.print_freqs(word_freqs)
    t1 = datetime.now()
    print(f"Data source 2 elapsed: {t1 - t0}")

    # Data source 3: same steps, using the English word-frequency counter.
    t0 = datetime.now()
    cds3 = CrawlerDataSource3()
    content = cds3.get_data_3()
    word_freqs = cds3.get_freqs_of_En(content)
    cds3.print_freqs(word_freqs)
    t1 = datetime.now()
    print(f"Data source 3 elapsed: {t1 - t0}")