CodePattern/20 高性能模式/util.py

import random
import string
import jieba
from collections import Counter
import time

###############################################################################
#  本主题通用代码
###############################################################################


class Crawler:
    """Base class for crawlers; supplies shared data and helper methods.

    Instances expose three interchangeable lists of URLs (``data_source_1``,
    ``data_source_2``, ``data_source_3``), a stop-word list (``stop_words``),
    a random request-header generator and word-frequency helpers.
    """

    # Pool of browser identities used by ``get_headers``.  Each entry is built
    # from adjacent string literals: a backslash continuation *inside* a
    # literal would embed the next line's indentation in the header value.
    _USER_AGENTS = (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:87.0) Gecko/20100101 "
        "Firefox/87.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 "
        "SE 2.X MetaSr 1.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
        "Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 "
        "Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 "
        "Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/70.0.3538.80 Safari/537.36",
        "Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) "
        "AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 "
        "Silk-Accelerated=true",
        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; "
        "Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/14.14263",
        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; "
        "Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/14.14263",
        "Mozilla/5.0 (Linux; Android 11; moto g power (2022)) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 "
        "Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 "
        "Safari/537.36",
        "Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 "
        "Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 "
        "Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 "
        "Mobile Safari/537.36",
    )

    def __init__(self) -> None:
        """Populate the stop-word list and the three URL lists."""
        self.stop_words = self.get_stop_words()

        # Three switchable data sources, each taken from a different website.
        self.data_source_1 = [
            'https://data.tpdc.ac.cn/zh-hans/data/4a68ebac-571f-4860-82b3-32d397b2ff59',
            'https://data.tpdc.ac.cn/zh-hans/data/19838f9a-9672-46d5-bd98-ca56973eb5fa',
            'https://data.tpdc.ac.cn/zh-hans/data/39018cd4-a3aa-4888-9a07-7316b52aefbc',
            'https://data.tpdc.ac.cn/zh-hans/data/12191fa5-2818-4557-b665-dbae88e557a4',
            'https://data.tpdc.ac.cn/zh-hans/data/5296105f-f1b6-4296-b2ee-8ddb0945d017',
            'https://data.tpdc.ac.cn/zh-hans/data/5c6b66b6-2699-4664-9024-0bfc4d7b6800',
            'https://data.tpdc.ac.cn/zh-hans/data/c4df7e12-ec56-4078-aebb-fb1b142ae622',
            'https://data.tpdc.ac.cn/zh-hans/data/27f4e535-16f8-47ae-be83-499cdf61f522',
            'https://data.tpdc.ac.cn/zh-hans/data/689ab245-cc98-440d-91d5-9784beb0a675',
            'https://data.tpdc.ac.cn/zh-hans/data/dc672fe7-7b00-4626-a9d5-8a0e00d791c3',
        ]

        self.data_source_2 = [
            f'https://wen.lzep.cn/wen/186{i}.html' for i in range(100, 130)
        ]

        # Project Gutenberg serves plain text at /cache/epub/<id>/pg<id>.txt,
        # so the directory and the file name must carry the same book id.
        self.data_source_3 = [
            f'https://www.gutenberg.org/cache/epub/73{i}/pg73{i}.txt'
            for i in range(200, 210)
        ]

    def get_stop_words(self) -> list[str]:
        """Return a fresh list of stop words.

        The list holds common English function words, a few Chinese words and
        every single lowercase ASCII letter.  It may contain duplicates (for
        example ``'a'`` and ``'i'`` appear both as words and as letters).
        """
        stop_words = [
            'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also',
            'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at', 'be',
            'because', 'been', 'but', 'by', 'can', 'cannot', 'could', 'dear',
            'did', 'do', 'does', 'either', 'else', 'ever', 'every', 'for',
            'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers',
            'him', 'his', 'how', 'however', 'i', 'if', 'in', 'into', 'is',
            'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me',
            'might', 'most', 'must', 'my', 'neither', 'no', 'nor', 'not', 'of',
            'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own',
            'rather', 'said', 'say', 'says', 'she', 'should', 'since', 'so',
            'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there',
            'these', 'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants',
            'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while',
            'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you',
            'your', '的', '是', '一些', '将', '它们', '这个', '用于', '包含', '用于', '示例'
        ]

        # Single letters are treated as stop words as well.
        stop_words.extend(string.ascii_lowercase)

        return stop_words

    def get_headers(self) -> dict[str, str]:
        """Return request headers carrying a randomly chosen User-Agent.

        Returns:
            A one-entry dict whose key is the HTTP header name ``User-Agent``.
        """
        return {"User-Agent": random.choice(self._USER_AGENTS)}

    def get_freqs(self, content: str, top_n: int = 10) -> list[tuple[str, int]]:
        """Return the most frequent non-stop-word tokens of ``content``.

        Args:
            content: Text to segment; it is tokenised with ``jieba.cut``.
            top_n: Maximum number of ``(word, count)`` pairs to return.

        Returns:
            Up to ``top_n`` ``(word, count)`` pairs, most frequent first.
        """
        # Build the lookup set once per call so each membership test is O(1);
        # reading self.stop_words here keeps later edits to that list visible.
        stop_words = set(self.stop_words)

        word_freqs = Counter(
            word for word in jieba.cut(content) if word not in stop_words
        )
        return word_freqs.most_common(top_n)

    def print_freqs(self, word_freqs: list[tuple[str, int]],
                    limit: int = 10) -> None:
        """Print ``keyword: count`` lines for the leading entries.

        Only the first ``limit`` entries are examined, and single-character
        keywords among them are skipped, so fewer than ``limit`` lines may be
        printed.

        Args:
            word_freqs: ``(keyword, count)`` pairs, e.g. from ``get_freqs``.
            limit: How many leading entries to examine.
        """
        if limit <= 0:
            return

        for position, (keyword, freq) in enumerate(word_freqs, start=1):
            if len(keyword) > 1:
                print(f"{keyword}: {freq}")
            # Stop right after the last examined entry without pulling more.
            if position >= limit:
                return
Update util.py 12 months ago			`import random`
			`import string`
			`import jieba`
			`from collections import Counter`
			`import time`
调整 12 months ago
Update util.py 12 months ago			`###############################################################################`
调整 12 months ago			`# 本主题通用代码`
Update util.py 12 months ago			`###############################################################################`


			`class Crawler:`
			`"""爬虫父类,为子类爬虫提供基础变量与函数"""`
			`def __init__(self) -> None:`

			`self.stop_words = self.get_stop_words()`

			`# 一共有三个可以切换的数据源，来自三个不同的网站`
			`self.data_source_1 = [`
			`'https://data.tpdc.ac.cn/zh-hans/data/4a68ebac-571f-4860-82b3-32d397b2ff59',`
			`'https://data.tpdc.ac.cn/zh-hans/data/19838f9a-9672-46d5-bd98-ca56973eb5fa',`
			`'https://data.tpdc.ac.cn/zh-hans/data/39018cd4-a3aa-4888-9a07-7316b52aefbc',`
			`'https://data.tpdc.ac.cn/zh-hans/data/12191fa5-2818-4557-b665-dbae88e557a4',`
			`'https://data.tpdc.ac.cn/zh-hans/data/5296105f-f1b6-4296-b2ee-8ddb0945d017',`
			`'https://data.tpdc.ac.cn/zh-hans/data/5c6b66b6-2699-4664-9024-0bfc4d7b6800',`
			`'https://data.tpdc.ac.cn/zh-hans/data/c4df7e12-ec56-4078-aebb-fb1b142ae622',`
			`'https://data.tpdc.ac.cn/zh-hans/data/27f4e535-16f8-47ae-be83-499cdf61f522',`
			`'https://data.tpdc.ac.cn/zh-hans/data/689ab245-cc98-440d-91d5-9784beb0a675',`
			`'https://data.tpdc.ac.cn/zh-hans/data/dc672fe7-7b00-4626-a9d5-8a0e00d791c3'`
			`]`

			`self.data_source_2 = [`
			`f'https://wen.lzep.cn/wen/186{i}.html' for i in range(100, 130)`
			`]`

			`self.data_source_3 = [`
			`f'https://www.gutenberg.org/cache/epub/73208/pg73{i}.txt'`
			`for i in range(200, 210)`
			`]`

			`def get_stop_words(self) -> list:`
			`"""返回常用停用词列表"""`
			`# 创建停用词列表`
			`stop_words = [`
			`'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also',`
			`'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at', 'be',`
			`'because', 'been', 'but', 'by', 'can', 'cannot', 'could', 'dear',`
			`'did', 'do', 'does', 'either', 'else', 'ever', 'every', 'for',`
			`'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers',`
			`'him', 'his', 'how', 'however', 'i', 'if', 'in', 'into', 'is',`
			`'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me',`
			`'might', 'most', 'must', 'my', 'neither', 'no', 'nor', 'not', 'of',`
			`'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own',`
			`'rather', 'said', 'say', 'says', 'she', 'should', 'since', 'so',`
			`'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there',`
			`'these', 'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants',`
			`'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while',`
			`'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you',`
			`'your', '的', '是', '一些', '将', '它们', '这个', '用于', '包含', '用于', '示例'`
			`]`

			`# 将小写字母添加到停用词列表中`
			`stop_words.extend(list(string.ascii_lowercase))`

			`return stop_words`

			`def get_headers(self) -> dict:`
			`"""返回标头"""`
			`agent_list = [`
			`"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:87.0) Gecko/20100101 \`
			`Firefox/87.0",`
			`"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, li\`
			`ke Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",`
			`'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHT\`
			`ML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',`
			`"Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebK\`
			`it/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/53\`
			`7.36",`
			`"Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWeb\`
			`Kit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 Mobile Safari/\`
			`537.36",`
			`"Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (\`
			`KHTML, like Gecko) Chrome/70.0.3538.80 Safari/537.36",`
			`"Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535\`
			`.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=t\`
			`rue",`
			`"Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia \`
			`550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mob\`
			`ile Safari/537.36 Edge/14.14263",`
			`"Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia \`
			`950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mob\`
			`ile Safari/537.36 Edge/14.14263",`
			`"Mozilla/5.0 (Linux; Android 11; moto g power (2022)) AppleWebKit/\`
			`537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",`
			`"Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) AppleWebKit/537.36\`
			`(KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",`
			`"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWe\`
			`bKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",`
			`"Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWeb\`
			`Kit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.\`
			`36",`
			`"Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKi\`
			`t/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36\`
			`",`
			`"Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006\`
			`) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile S\`
			`afari/537.36",`
			`]`
			`headers = {`
			`"user_agent": random.choice(agent_list),`
			`}`
			`return headers`

			`def get_freqs(self, content: str) -> list[tuple[any, int]]:`
			`"""获取中文文本content的词频"""`
			`# 分词`
			`words = jieba.cut(content)`

			`word_freqs = Counter()`
			`for word in words:`
			`if word not in self.stop_words:`
			`word_freqs[word] += 1`

			`word_freqs = word_freqs.most_common(10)`
			`return word_freqs`

			`def print_freqs(self, word_freqs: list[tuple[any, int]]) -> None:`
			`"""打印词频"""`
			`cnt = 1`
			`for keyword, freq in word_freqs:`
			`if len(keyword) > 1:`
			`print(f"{keyword}: {freq}")`
			`cnt += 1`
			`if cnt > 10:`
			`return`