You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

136 lines
6.4 KiB

12 months ago
import random
import string
import jieba
from collections import Counter
import time
12 months ago
12 months ago
###############################################################################
# Code shared across this topic
###############################################################################
class Crawler:
    """Base crawler class providing shared variables and helpers to subclasses.

    Instances expose:

    * ``stop_words`` -- the list returned by :meth:`get_stop_words`.
    * ``data_source_1`` / ``data_source_2`` / ``data_source_3`` -- three
      interchangeable lists of URLs, each pointing at a different website.
    """

    def __init__(self) -> None:
        """Populate the stop-word list and the three URL lists."""
        self.stop_words = self.get_stop_words()
        # Three switchable data sources, taken from three different websites.
        self.data_source_1 = [
            'https://data.tpdc.ac.cn/zh-hans/data/4a68ebac-571f-4860-82b3-32d397b2ff59',
            'https://data.tpdc.ac.cn/zh-hans/data/19838f9a-9672-46d5-bd98-ca56973eb5fa',
            'https://data.tpdc.ac.cn/zh-hans/data/39018cd4-a3aa-4888-9a07-7316b52aefbc',
            'https://data.tpdc.ac.cn/zh-hans/data/12191fa5-2818-4557-b665-dbae88e557a4',
            'https://data.tpdc.ac.cn/zh-hans/data/5296105f-f1b6-4296-b2ee-8ddb0945d017',
            'https://data.tpdc.ac.cn/zh-hans/data/5c6b66b6-2699-4664-9024-0bfc4d7b6800',
            'https://data.tpdc.ac.cn/zh-hans/data/c4df7e12-ec56-4078-aebb-fb1b142ae622',
            'https://data.tpdc.ac.cn/zh-hans/data/27f4e535-16f8-47ae-be83-499cdf61f522',
            'https://data.tpdc.ac.cn/zh-hans/data/689ab245-cc98-440d-91d5-9784beb0a675',
            'https://data.tpdc.ac.cn/zh-hans/data/dc672fe7-7b00-4626-a9d5-8a0e00d791c3',
        ]
        self.data_source_2 = [
            f'https://wen.lzep.cn/wen/186{i}.html' for i in range(100, 130)
        ]
        # Project Gutenberg cache URLs have the form
        # ``cache/epub/<id>/pg<id>.txt``; the directory and the file name must
        # carry the same ebook id, so both are derived from ``i``.
        self.data_source_3 = [
            f'https://www.gutenberg.org/cache/epub/73{i}/pg73{i}.txt'
            for i in range(200, 210)
        ]

    def get_stop_words(self) -> list:
        """Return the list of common stop words.

        The list holds a fixed set of English and Chinese entries followed by
        every lowercase ASCII letter. Duplicates are kept as they are.

        Returns:
            A new list of stop-word strings on every call.
        """
        # Build the base stop-word list.
        # NOTE(review): the empty-string entries below look like placeholders
        # or characters lost in an earlier edit -- confirm what was intended.
        stop_words = [
            'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also',
            'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at', 'be',
            'because', 'been', 'but', 'by', 'can', 'cannot', 'could', 'dear',
            'did', 'do', 'does', 'either', 'else', 'ever', 'every', 'for',
            'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers',
            'him', 'his', 'how', 'however', 'i', 'if', 'in', 'into', 'is',
            'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me',
            'might', 'most', 'must', 'my', 'neither', 'no', 'nor', 'not', 'of',
            'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own',
            'rather', 'said', 'say', 'says', 'she', 'should', 'since', 'so',
            'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there',
            'these', 'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants',
            'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while',
            'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you',
            'your', '', '', '一些', '', '它们', '这个', '用于', '包含', '用于', '示例'
        ]
        # Single lowercase letters are treated as stop words too.
        stop_words.extend(string.ascii_lowercase)
        return stop_words

    def get_headers(self) -> dict:
        """Return request headers with a randomly chosen User-Agent.

        Returns:
            A dict with the single key ``"User-Agent"`` whose value is picked
            at random from a fixed list of browser user-agent strings.
        """
        # Each entry is assembled from adjacent string literals rather than
        # backslash continuations inside one literal, so the indentation of
        # the source lines can never leak into the header value.
        agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:87.0) "
            "Gecko/20100101 Firefox/87.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 "
            "SE 2.X MetaSr 1.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 "
            "Edg/120.0.0.0",
            "Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 "
            "Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 "
            "Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 "
            "Safari/537.36",
            "Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) "
            "AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 "
            "Safari/535.19 Silk-Accelerated=true",
            "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; "
            "Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/14.14263",
            "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; "
            "Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/14.14263",
            "Mozilla/5.0 (Linux; Android 11; moto g power (2022)) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 "
            "Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 "
            "Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 "
            "Safari/537.36",
            "Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 "
            "Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 "
            "Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X "
            "Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/114.0.0.0 Mobile Safari/537.36",
        ]
        # The HTTP header name is "User-Agent"; a "user_agent" key would be
        # sent as an unrelated custom header and leave the real one unchanged.
        return {"User-Agent": random.choice(agent_list)}

    def get_freqs(self, content: str) -> list[tuple[str, int]]:
        """Return the word frequencies of the Chinese text ``content``.

        The text is segmented with ``jieba.cut``; tokens found in
        ``self.stop_words`` are skipped and the rest are counted.

        Args:
            content: The text to analyse.

        Returns:
            Up to ten ``(word, count)`` pairs, most frequent first.
        """
        # Build the lookup set once so each token test is a hash lookup
        # instead of a scan over the whole stop-word list.
        stop_words = set(self.stop_words)
        word_freqs = Counter(
            word for word in jieba.cut(content) if word not in stop_words
        )
        return word_freqs.most_common(10)

    def print_freqs(self, word_freqs: list[tuple[str, int]]) -> None:
        """Print word frequencies, one ``keyword: count`` line per entry.

        Keywords of length one or zero are skipped, and printing stops after
        ten lines.

        NOTE(review): the original indentation of this method was lost; this
        assumes the counter tracks printed entries -- confirm.

        Args:
            word_freqs: ``(keyword, count)`` pairs in the order to print.
        """
        printed = 0
        for keyword, freq in word_freqs:
            if len(keyword) <= 1:
                continue
            print(f"{keyword}: {freq}")
            printed += 1
            if printed >= 10:
                return