ReviewAndAnalyzeOpenSourceS.../src/DjangoBlog/blog/documents.py

# bjy: 导入时间模块
import time

# bjy: 导入Elasticsearch的客户端模块和异常类
import elasticsearch.client
import elasticsearch.exceptions
# bjy: 导入Django的设置
from django.conf import settings
# bjy: 从elasticsearch_dsl中导入文档、内部文档、字段类型和连接管理器
from elasticsearch_dsl import Document, InnerDoc, Date, Integer, Long, Text, Object, GeoPoint, Keyword, Boolean
from elasticsearch_dsl.connections import connections

# bjy: 从blog应用中导入Article模型
from blog.models import Article

# Elasticsearch support is considered enabled when the Django settings object
# defines an ELASTICSEARCH_DSL attribute.
ELASTICSEARCH_ENABLED = hasattr(settings, 'ELASTICSEARCH_DSL')

# Everything below runs at import time, and only when Elasticsearch is enabled.
if ELASTICSEARCH_ENABLED:
    # Register the elasticsearch_dsl connection using the hosts value found
    # under settings.ELASTICSEARCH_DSL['default'].
    connections.create_connection(
        hosts=[settings.ELASTICSEARCH_DSL['default']['hosts']])
    # Build a separate low-level client from the same hosts setting.
    from elasticsearch import Elasticsearch

    es = Elasticsearch(settings.ELASTICSEARCH_DSL['default']['hosts'])
    # The ingest client is used to look up and register ingest pipelines.
    from elasticsearch.client import IngestClient

    c = IngestClient(es)
    # Probe for an existing pipeline named 'geoip'.
    try:
        c.get_pipeline('geoip')
    # The pipeline was not found: register it.
    except elasticsearch.exceptions.NotFoundError:
        # The pipeline consists of a single geoip processor that reads the
        # document's "ip" field.
        c.put_pipeline('geoip', body='''{
              "description" : "Add geoip info",
              "processors" : [
                {
                  "geoip" : {
                    "field" : "ip"
                  }
                }
              ]
            }''')


# Inner document (InnerDoc) describing the geolocation of an IP address.
class GeoIp(InnerDoc):
    """Geolocation sub-document: continent, country and coordinates."""
    # Continent name
    continent_name = Keyword()
    # Country ISO code
    country_iso_code = Keyword()
    # Country name
    country_name = Keyword()
    # Geographic coordinates, stored as a geo point
    location = GeoPoint()


# Inner document holding the browser part of a parsed User-Agent.
class UserAgentBrowser(InnerDoc):
    """Browser sub-document: family and version, both stored as keywords."""
    # Browser family (e.g. Chrome, Firefox)
    Family = Keyword()
    # Browser version
    Version = Keyword()


# Inner document holding the operating-system part of a parsed User-Agent.
class UserAgentOS(UserAgentBrowser):
    """Operating-system sub-document; adds nothing to UserAgentBrowser."""
    # Inherits Family and Version from UserAgentBrowser; no extra fields.
    pass


# Inner document holding the device part of a parsed User-Agent.
class UserAgentDevice(InnerDoc):
    """Device sub-document: family, brand and model, all stored as keywords."""
    # Device family (e.g. iPhone, Android)
    Family = Keyword()
    # Device brand (e.g. Apple, Samsung)
    Brand = Keyword()
    # Device model (e.g. iPhone 12)
    Model = Keyword()


# Inner document holding the complete User-Agent information.
class UserAgent(InnerDoc):
    """User-Agent sub-document grouping browser, OS and device details."""
    # Nested browser info (optional)
    browser = Object(UserAgentBrowser, required=False)
    # Nested operating-system info (optional)
    os = Object(UserAgentOS, required=False)
    # Nested device info (optional)
    device = Object(UserAgentDevice, required=False)
    # Raw User-Agent string
    string = Text()
    # Whether the client is a crawler or bot
    is_bot = Boolean()


# Elasticsearch document recording per-request performance data.
class ElapsedTimeDocument(Document):
    """Performance record for one request, stored in the 'performance' index."""
    # Requested URL
    url = Keyword()
    # Time taken by the request (milliseconds, per the original author's note;
    # the unit is decided by whoever supplies the value)
    time_taken = Long()
    # Timestamp of the log entry
    log_datetime = Date()
    # Client IP address
    ip = Keyword()
    # Optional nested geolocation info for the IP
    geoip = Object(GeoIp, required=False)
    # Optional nested User-Agent info
    useragent = Object(UserAgent, required=False)

    class Index:
        # Name of the index this document is stored in
        name = 'performance'
        # Shard and replica counts for the index
        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 0
        }

    class Meta:
        # Document type name
        doc_type = 'ElapsedTime'


# Manager for the 'performance' index that backs ElapsedTimeDocument.
# The historical spelling of the class name ("Elasped") is kept unchanged so
# that any existing imports keep working.
class ElaspedTimeDocumentManager:
    """Static helpers to create, drop and populate the 'performance' index.

    All methods talk to Elasticsearch through the default connection
    registered with ``elasticsearch_dsl.connections`` instead of building a
    new client per call; ``connections.get_connection()`` raises ``KeyError``
    when no default connection has been configured.
    """

    @staticmethod
    def build_index():
        """Create the 'performance' index if it does not exist yet."""
        from elasticsearch_dsl.connections import connections

        # Reuse the shared connection: this method runs on every create(),
        # so constructing a fresh client here would churn connection pools.
        client = connections.get_connection()
        if not client.indices.exists(index="performance"):
            ElapsedTimeDocument.init()

    @staticmethod
    def delete_index():
        """Delete the 'performance' index, ignoring 400 and 404 responses."""
        from elasticsearch_dsl.connections import connections

        client = connections.get_connection()
        client.indices.delete(index='performance', ignore=[400, 404])

    @staticmethod
    def _build_useragent(useragent):
        """Map a parsed user-agent object onto a UserAgent inner document.

        ``useragent`` must expose ``browser`` and ``os`` (each with ``family``
        and ``version_string``), ``device`` (``family``, ``brand``,
        ``model``), ``ua_string`` and ``is_bot``.
        """
        return UserAgent(
            browser=UserAgentBrowser(
                Family=useragent.browser.family,
                Version=useragent.browser.version_string,
            ),
            os=UserAgentOS(
                Family=useragent.os.family,
                Version=useragent.os.version_string,
            ),
            device=UserAgentDevice(
                Family=useragent.device.family,
                Brand=useragent.device.brand,
                Model=useragent.device.model,
            ),
            string=useragent.ua_string,
            is_bot=useragent.is_bot,
        )

    @staticmethod
    def create(url, time_taken, log_datetime, useragent, ip):
        """Store one performance record, enriched by the 'geoip' pipeline.

        Args:
            url: Requested URL.
            time_taken: Request duration as supplied by the caller.
            log_datetime: Timestamp of the record.
            useragent: Parsed user-agent object (see ``_build_useragent``).
            ip: Client IP address; the 'geoip' pipeline reads this field.
        """
        # Make sure the index exists before writing to it.
        ElaspedTimeDocumentManager.build_index()

        # The document id is the current time in milliseconds.
        # NOTE(review): two records created within the same millisecond get
        # the same id, so the later save would replace the earlier document.
        doc_id = int(round(time.time() * 1000))

        doc = ElapsedTimeDocument(
            meta={'id': doc_id},
            url=url,
            time_taken=time_taken,
            log_datetime=log_datetime,
            useragent=ElaspedTimeDocumentManager._build_useragent(useragent),
            ip=ip)
        # Save through the 'geoip' ingest pipeline so the IP gets processed.
        doc.save(pipeline="geoip")


# Elasticsearch document holding blog article data for full-text search.
class ArticleDocument(Document):
    """Search document for a blog article, stored in the 'blog' index."""
    # Article body; indexed with the 'ik_max_word' analyzer and searched
    # with 'ik_smart'
    body = Text(analyzer='ik_max_word', search_analyzer='ik_smart')
    # Article title; same analyzers as the body
    title = Text(analyzer='ik_max_word', search_analyzer='ik_smart')
    # Author info as an object with a nickname and an id
    author = Object(properties={
        'nickname': Text(analyzer='ik_max_word', search_analyzer='ik_smart'),
        'id': Integer()
    })
    # Category info as an object with a name and an id
    category = Object(properties={
        'name': Text(analyzer='ik_max_word', search_analyzer='ik_smart'),
        'id': Integer()
    })
    # Tag info as an object with a name and an id
    tags = Object(properties={
        'name': Text(analyzer='ik_max_word', search_analyzer='ik_smart'),
        'id': Integer()
    })

    # Publication time
    pub_time = Date()
    # Article status
    status = Text()
    # Comment status
    comment_status = Text()
    # Article type
    type = Text()
    # View count
    views = Integer()
    # Ordering weight of the article
    article_order = Integer()

    class Index:
        # Name of the index this document is stored in
        name = 'blog'
        # Shard and replica counts for the index
        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 0
        }

    class Meta:
        # Document type name
        doc_type = 'Article'


# Manager for the 'blog' index that backs ArticleDocument.
class ArticleDocumentManager:
    """Create, drop and populate the 'blog' index from Article objects."""

    def __init__(self):
        """Ensure the 'blog' index exists as soon as a manager is built."""
        self.create_index()

    def create_index(self):
        """Initialise the 'blog' index via ``ArticleDocument.init()``."""
        ArticleDocument.init()

    def delete_index(self):
        """Delete the 'blog' index, ignoring 400 and 404 responses.

        Uses the default connection registered with
        ``elasticsearch_dsl.connections`` rather than a throwaway client;
        ``get_connection()`` raises ``KeyError`` if none is configured.
        """
        from elasticsearch_dsl.connections import connections

        client = connections.get_connection()
        client.indices.delete(index='blog', ignore=[400, 404])

    def convert_to_doc(self, articles):
        """Convert an iterable of Article objects to ArticleDocument objects.

        Each document uses the article's id as its document id. The author's
        ``username`` is stored under the ``nickname`` key.

        Returns:
            A list with one ArticleDocument per input article, in order.
        """
        return [
            ArticleDocument(
                meta={'id': article.id},
                body=article.body,
                title=article.title,
                author={
                    'nickname': article.author.username,
                    'id': article.author.id,
                },
                category={
                    'name': article.category.name,
                    'id': article.category.id,
                },
                tags=[
                    {'name': tag.name, 'id': tag.id}
                    for tag in article.tags.all()
                ],
                pub_time=article.pub_time,
                status=article.status,
                comment_status=article.comment_status,
                type=article.type,
                views=article.views,
                article_order=article.article_order,
            )
            for article in articles
        ]

    def rebuild(self, articles=None):
        """Initialise the index and (re)index the given articles.

        Args:
            articles: Iterable of Article objects to index. When omitted
                (``None``), every article from ``Article.objects.all()`` is
                indexed. An explicitly passed empty iterable indexes nothing.
        """
        ArticleDocument.init()
        # Test against None rather than truthiness: an empty collection is a
        # valid request to index nothing, not a request to index everything.
        if articles is None:
            articles = Article.objects.all()
        self.update_docs(self.convert_to_doc(articles))

    def update_docs(self, docs):
        """Save each document in ``docs``, one at a time and in order."""
        for doc in docs:
            doc.save()