import random
import string
import jieba
from collections import Counter
import time
# 本主题通用代码
class Crawler:
def __init__(self) -> None:
# Set up the crawler: load the stop-word list via get_stop_words() and
# define the data-source lists.
self.stop_words = self.get_stop_words()
# There are three switchable data sources, taken from three different websites.
# NOTE(review): the list literals below look truncated in this view (element
# expressions and closing brackets are missing) - restore them from the
# original file before running.
self.data_source_1 = [
self.data_source_2 = [
f'{i}.html' for i in range(100, 130)
self.data_source_3 = [
for i in range(200, 210)
def get_stop_words(self) -> list:
# Return the list of stop words (English function words plus a few Chinese
# terms) that get_freqs() excludes from its counts.
# Build the stop-word list.
stop_words = [
'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also',
'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at', 'be',
'because', 'been', 'but', 'by', 'can', 'cannot', 'could', 'dear',
'did', 'do', 'does', 'either', 'else', 'ever', 'every', 'for',
'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers',
'him', 'his', 'how', 'however', 'i', 'if', 'in', 'into', 'is',
'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me',
'might', 'most', 'must', 'my', 'neither', 'no', 'nor', 'not', 'of',
'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own',
'rather', 'said', 'say', 'says', 'she', 'should', 'since', 'so',
'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there',
'these', 'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants',
'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while',
'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you',
'your', '', '', '一些', '', '它们', '这个', '用于', '包含', '用于', '示例'
# Add the lowercase letters to the stop-word list.
# NOTE(review): the list's closing bracket and the statement that adds the
# lowercase letters are not visible in this view - presumably lost; confirm
# against the original file.
return stop_words
def get_headers(self) -> dict:
# Build a headers dict whose single entry is a user-agent string chosen at
# random from agent_list.
# NOTE(review): several user-agent literals below end in a line-continuation
# backslash whose following line is missing in this view, and the closing
# brackets of agent_list / headers are absent - restore from the original file.
# NOTE(review): the dict key is "user_agent"; the standard HTTP header name is
# "User-Agent" - confirm how callers consume this dict before relying on it.
agent_list = [
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:87.0) Gecko/20100101 \
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, li\
ke Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHT\
ML, like Gecko) Chrome/ Safari/537.36 Edg/',
"Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebK\
it/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/53\
"Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWeb\
Kit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 Mobile Safari/\
"Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (\
KHTML, like Gecko) Chrome/70.0.3538.80 Safari/537.36",
"Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535\
.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=t\
"Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia \
550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mob\
ile Safari/537.36 Edge/14.14263",
"Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia \
950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mob\
ile Safari/537.36 Edge/14.14263",
"Mozilla/5.0 (Linux; Android 11; moto g power (2022)) AppleWebKit/\
537.36 (KHTML, like Gecko) Chrome/ Mobile Safari/537.36",
"Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) AppleWebKit/537.36\
(KHTML, like Gecko) Chrome/ Mobile Safari/537.36",
"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWe\
bKit/537.36 (KHTML, like Gecko) Chrome/ Safari/537.36",
"Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWeb\
Kit/537.36 (KHTML, like Gecko) Chrome/ Mobile Safari/537.\
"Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKi\
t/537.36 (KHTML, like Gecko) Chrome/ Mobile Safari/537.36\
"Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006\
) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Mobile S\
headers = {
"user_agent": random.choice(agent_list),
return headers
def get_freqs(self, content: str) -> list[tuple[str, int]]:
    """Return the ten most frequent non-stop-word tokens in *content*.

    The text is segmented with ``jieba.cut``; tokens that appear in
    ``self.stop_words`` are discarded before counting.

    Args:
        content: Text to segment and count.

    Returns:
        Up to ten ``(token, count)`` pairs, most frequent first, as
        produced by ``Counter.most_common(10)``.
    """
    # Build the lookup set once so each membership test is O(1) instead of
    # a linear scan of the stop-word list for every token.
    stop_words = set(self.stop_words)
    counts = Counter(
        token for token in jieba.cut(content) if token not in stop_words
    )
    return counts.most_common(10)
def print_freqs(self, word_freqs: list[tuple[any, int]]) -> None:
cnt = 1
for keyword, freq in word_freqs:
if len(keyword) > 1:
print(f"{keyword}: {freq}")
cnt += 1
if cnt > 10: