Update 000 普通做法.py

dev
pcz4qfnkl 11 months ago
parent a07a0a59f5
commit b833cc4236

@ -0,0 +1,106 @@
import re
from collections import Counter
from datetime import datetime

import requests
from lxml import etree

from util import Crawler
'''这是对于数据源2的普通做法按照爬取数据获取词频打印结果的步骤依次运行程序。'''
class CrawlerDataSource2(Crawler):
    """Crawler for data source 2: fetch each page and collect its article text."""

    def __init__(self) -> None:
        super().__init__()

    def crawler_2(self, url: str):
        """Fetch one data-source-2 URL and extract the article body text.

        Args:
            url: Page URL to request (POSTed, as this source requires).

        Returns:
            The extracted text, or None when the request or parsing fails.
        """
        try:
            response = requests.post(url, headers=self.get_headers())
            html = response.content.decode('utf-8')
            tree = etree.HTML(html)
            # The first <p> under this container holds the article body.
            text = tree.xpath('/html/body/div/div/div[4]/div[1]/p/text()')[0]
            return text
        except Exception as result:
            print(f"发现错误: {result}")
            # time.sleep(3)
            return None

    def get_data_2(self) -> str:
        """Crawl every URL in self.data_source_2 and concatenate the texts."""
        parts = []
        for url in self.data_source_2:
            text = self.crawler_2(url)
            if text:
                # Bug fix: reuse the already-fetched text instead of issuing
                # a second, redundant HTTP request for the same URL.
                parts.append(text)
        # str.join avoids quadratic repeated string concatenation.
        return ''.join(parts)
'''
这是对于数据源3的普通做法按照爬取数据获取词频打印结果的步骤依次运行程序
'''
class CrawlerDataSource3(Crawler):
    """Crawler for data source 3: fetch raw pages and compute English word frequencies."""

    def __init__(self) -> None:
        super().__init__()

    def crawler_3(self, url: str):
        """Fetch one data-source-3 URL and return the decoded page text.

        Args:
            url: Page URL to request.

        Returns:
            The UTF-8 decoded response body, or None when the request fails.
        """
        try:
            response = requests.get(url, headers=self.get_headers())
            return response.content.decode('utf-8')
        except Exception as result:
            print(f"发现错误: {result}")
            # time.sleep(3)
            return None

    def get_data_3(self) -> str:
        """Crawl every URL in self.data_source_3 and concatenate the texts."""
        parts = []
        for url in self.data_source_3:
            text = self.crawler_3(url)
            if text:
                # Bug fix: reuse the already-fetched text instead of
                # crawling the same URL a second time.
                parts.append(text)
        return ''.join(parts)

    def get_freqs_of_En(self, content: str) -> list[tuple[str, int]]:
        """Count word frequencies in an English text.

        Words are lower-cased, split on non-word characters and underscores,
        filtered against self.stop_words, and must be at least 3 chars long.

        Args:
            content: The raw English text to analyse.

        Returns:
            (word, count) pairs sorted by count, descending.
            (Annotation fixed: was list[tuple[any, str]] — `any` is a builtin
            function, and each pair is actually (str word, int count).)
        """
        # Raw string avoids the invalid-escape warning for '\W'.
        words = re.compile(r'[\W_]+').sub(' ', content).lower().split()
        words = [w for w in words if w not in self.stop_words and len(w) >= 3]
        # Counter.most_common sorts by count descending and is stable for
        # ties, matching the original sorted(..., reverse=True) behaviour.
        return Counter(words).most_common()
if __name__ == '__main__':
    # Time the data-source-2 pipeline end to end: crawl, count, print.
    t0 = datetime.now()
    cds2 = CrawlerDataSource2()
    cds2.print_freqs(cds2.get_freqs(cds2.get_data_2()))
    t1 = datetime.now()
    print(f"数据2耗时:{t1-t0}")

    # Time the data-source-3 pipeline the same way.
    t0 = datetime.now()
    cds3 = CrawlerDataSource3()
    cds3.print_freqs(cds3.get_freqs_of_En(cds3.get_data_3()))
    t1 = datetime.now()
    print(f"数据3耗时:{t1-t0}")
Loading…
Cancel
Save