From 5bae016d7a6e370879231699a51b728422762c65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=A5=E6=AC=A3=E6=80=A1?= <3093609022@qq.com> Date: Sun, 19 Oct 2025 15:37:55 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=BAdocuments.py=E6=B7=BB=E5=8A=A0?= =?UTF-8?q?=E4=BB=A3=E7=A0=81=E6=B3=A8=E9=87=8A=EF=BC=8C=E5=AE=8C=E5=96=84?= =?UTF-8?q?=E9=A1=B9=E7=9B=AE=E6=96=87=E6=A1=A3=E5=8F=AF=E8=AF=BB=E6=80=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../DjangoBlog-master/blog/documents.py | 208 +++++++++++------- 1 file changed, 131 insertions(+), 77 deletions(-) diff --git a/djangoblog/src/DjangoBlog-master/DjangoBlog-master/blog/documents.py b/djangoblog/src/DjangoBlog-master/DjangoBlog-master/blog/documents.py index 0f1db7b7..c9ba1285 100644 --- a/djangoblog/src/DjangoBlog-master/DjangoBlog-master/blog/documents.py +++ b/djangoblog/src/DjangoBlog-master/DjangoBlog-master/blog/documents.py @@ -1,213 +1,267 @@ +# 导入时间处理模块 import time +# 导入Elasticsearch客户端相关模块 import elasticsearch.client +# 导入Django配置模块 from django.conf import settings +# 导入Elasticsearch DSL相关组件,用于定义文档结构 from elasticsearch_dsl import Document, InnerDoc, Date, Integer, Long, Text, Object, GeoPoint, Keyword, Boolean from elasticsearch_dsl.connections import connections +# 导入博客文章模型,用于数据同步 from blog.models import Article +# 检查是否启用Elasticsearch(通过判断配置中是否存在ELASTICSEARCH_DSL) ELASTICSEARCH_ENABLED = hasattr(settings, 'ELASTICSEARCH_DSL') +# 如果启用了Elasticsearch,则进行初始化配置 if ELASTICSEARCH_ENABLED: + # 创建Elasticsearch连接(从Django配置中获取主机地址) connections.create_connection( hosts=[settings.ELASTICSEARCH_DSL['default']['hosts']]) + # 导入Elasticsearch客户端并初始化 from elasticsearch import Elasticsearch es = Elasticsearch(settings.ELASTICSEARCH_DSL['default']['hosts']) + + # 初始化IngestClient(用于处理数据预处理管道) from elasticsearch.client import IngestClient c = IngestClient(es) + + # 尝试获取名为'geoip'的管道,如果不存在则创建 try: c.get_pipeline('geoip') except elasticsearch.exceptions.NotFoundError: + # 创建geoip管道:通过ip地址解析地理位置信息 c.put_pipeline('geoip', body='''{ - "description" : "Add geoip info", + "description" : "Add geoip info", # 管道描述:添加地理信息 "processors" : [ { "geoip" : { - "field" : "ip" + "field" : "ip" # 基于ip字段解析地理信息 } } ] }''') +# 定义地理位置信息内部文档(嵌套在主文档中) class GeoIp(InnerDoc): - continent_name = Keyword() - country_iso_code = Keyword() - country_name = Keyword() - location = GeoPoint() + continent_name = Keyword() # 大洲名称( Keyword类型:不分词,适合精确查询) + country_iso_code = Keyword() # 国家ISO代码(如CN、US) + country_name = Keyword() # 国家名称 + location = GeoPoint() # 经纬度坐标(Elasticsearch地理点类型) +# 定义用户代理浏览器信息内部文档 class UserAgentBrowser(InnerDoc): - Family = Keyword() - Version = Keyword() + Family = Keyword() # 浏览器家族(如Chrome、Firefox) + Version = Keyword() # 浏览器版本 +# 定义用户代理操作系统信息内部文档(继承浏览器结构,字段相同) class UserAgentOS(UserAgentBrowser): pass +# 定义用户代理设备信息内部文档 class UserAgentDevice(InnerDoc): - Family = Keyword() - Brand = Keyword() - Model = Keyword() + Family = Keyword() # 设备家族(如iPhone、Windows) + Brand = Keyword() # 设备品牌(如Apple、Samsung) + Model = Keyword() # 设备型号(如iPhone 13) +# 定义用户代理整体信息内部文档(整合浏览器、系统、设备信息) class UserAgent(InnerDoc): - browser = Object(UserAgentBrowser, required=False) - os = Object(UserAgentOS, required=False) - device = Object(UserAgentDevice, required=False) - string = Text() - is_bot = Boolean() + browser = Object(UserAgentBrowser, required=False) # 浏览器信息(可选) + os = Object(UserAgentOS, required=False) # 操作系统信息(可选) + device = Object(UserAgentDevice, required=False) # 设备信息(可选) + string = Text() # 原始用户代理字符串(如"Mozilla/5.0...") + is_bot = Boolean() # 是否为爬虫机器人 +# 定义性能日志文档(记录访问性能数据) class ElapsedTimeDocument(Document): - url = Keyword() - time_taken = Long() - log_datetime = Date() - ip = Keyword() - geoip = Object(GeoIp, required=False) - useragent = Object(UserAgent, required=False) - + url = Keyword() # 访问的URL(精确匹配) + time_taken = Long() # 页面加载耗时(毫秒) + log_datetime = Date() # 日志记录时间 + ip = Keyword() # 访问者IP地址 + geoip = Object(GeoIp, required=False) # 地理位置信息(由geoip管道生成) + useragent = Object(UserAgent, required=False) # 用户代理信息 + + # 索引配置 class Index: - name = 'performance' + name = 'performance' # 索引名称:performance(性能日志) settings = { - "number_of_shards": 1, - "number_of_replicas": 0 + "number_of_shards": 1, # 主分片数量 + "number_of_replicas": 0 # 副本分片数量(单节点环境设为0) } + # 文档类型配置(Elasticsearch 7+后逐渐废弃,但DSL仍保留兼容) class Meta: doc_type = 'ElapsedTime' +# 性能日志文档管理器(处理索引创建、删除、数据写入) class ElaspedTimeDocumentManager: @staticmethod def build_index(): + """创建performance索引(如果不存在)""" from elasticsearch import Elasticsearch + # 连接Elasticsearch client = Elasticsearch(settings.ELASTICSEARCH_DSL['default']['hosts']) + # 检查索引是否存在 res = client.indices.exists(index="performance") if not res: + # 初始化索引(根据ElapsedTimeDocument的定义创建映射) ElapsedTimeDocument.init() @staticmethod def delete_index(): + """删除performance索引""" from elasticsearch import Elasticsearch es = Elasticsearch(settings.ELASTICSEARCH_DSL['default']['hosts']) + # 忽略400(索引不存在)和404(请求错误) es.indices.delete(index='performance', ignore=[400, 404]) @staticmethod def create(url, time_taken, log_datetime, useragent, ip): + """创建一条性能日志记录并写入Elasticsearch""" + # 确保索引存在 ElaspedTimeDocumentManager.build_index() + + # 构建用户代理信息对象 ua = UserAgent() ua.browser = UserAgentBrowser() - ua.browser.Family = useragent.browser.family - ua.browser.Version = useragent.browser.version_string + ua.browser.Family = useragent.browser.family # 浏览器家族 + ua.browser.Version = useragent.browser.version_string # 浏览器版本 ua.os = UserAgentOS() - ua.os.Family = useragent.os.family - ua.os.Version = useragent.os.version_string + ua.os.Family = useragent.os.family # 操作系统家族 + ua.os.Version = useragent.os.version_string # 操作系统版本 ua.device = UserAgentDevice() - ua.device.Family = useragent.device.family - ua.device.Brand = useragent.device.brand - ua.device.Model = useragent.device.model - ua.string = useragent.ua_string - ua.is_bot = useragent.is_bot + ua.device.Family = useragent.device.family # 设备家族 + ua.device.Brand = useragent.device.brand # 设备品牌 + ua.device.Model = useragent.device.model # 设备型号 + ua.string = useragent.ua_string # 原始用户代理字符串 + ua.is_bot = useragent.is_bot # 是否为爬虫 + # 构建性能日志文档 doc = ElapsedTimeDocument( meta={ - 'id': int( - round( - time.time() * - 1000)) + # 用当前时间戳(毫秒)作为文档ID + 'id': int(round(time.time() * 1000)) }, - url=url, - time_taken=time_taken, - log_datetime=log_datetime, - useragent=ua, ip=ip) + url=url, # 访问URL + time_taken=time_taken, # 耗时 + log_datetime=log_datetime, # 日志时间 + useragent=ua, # 用户代理信息 + ip=ip # IP地址 + ) + # 保存文档时应用geoip管道(自动解析IP对应的地理位置) doc.save(pipeline="geoip") +# 定义文章文档(用于博客文章的搜索索引) class ArticleDocument(Document): + # 文章内容(使用ik分词器:max_word最大化分词,smart智能分词) body = Text(analyzer='ik_max_word', search_analyzer='ik_smart') + # 文章标题(同上分词配置) title = Text(analyzer='ik_max_word', search_analyzer='ik_smart') + # 作者信息(嵌套对象) author = Object(properties={ - 'nickname': Text(analyzer='ik_max_word', search_analyzer='ik_smart'), - 'id': Integer() + 'nickname': Text(analyzer='ik_max_word', search_analyzer='ik_smart'), # 作者昵称 + 'id': Integer() # 作者ID }) + # 分类信息(嵌套对象) category = Object(properties={ - 'name': Text(analyzer='ik_max_word', search_analyzer='ik_smart'), - 'id': Integer() + 'name': Text(analyzer='ik_max_word', search_analyzer='ik_smart'), # 分类名称 + 'id': Integer() # 分类ID }) + # 标签信息(嵌套对象列表) tags = Object(properties={ - 'name': Text(analyzer='ik_max_word', search_analyzer='ik_smart'), - 'id': Integer() + 'name': Text(analyzer='ik_max_word', search_analyzer='ik_smart'), # 标签名称 + 'id': Integer() # 标签ID }) - - pub_time = Date() - status = Text() - comment_status = Text() - type = Text() - views = Integer() - article_order = Integer() - + pub_time = Date() # 发布时间 + status = Text() # 文章状态(如发布、草稿) + comment_status = Text() # 评论状态(如开启、关闭) + type = Text() # 文章类型(如原创、转载) + views = Integer() # 浏览量 + article_order = Integer() # 文章排序权重 + + # 索引配置 class Index: - name = 'blog' + name = 'blog' # 索引名称:blog(博客文章) settings = { "number_of_shards": 1, "number_of_replicas": 0 } + # 文档类型配置 class Meta: doc_type = 'Article' +# 文章文档管理器(处理文章索引的创建、更新、重建) class ArticleDocumentManager(): def __init__(self): + """初始化时创建索引(如果不存在)""" self.create_index() def create_index(self): + """创建blog索引(根据ArticleDocument定义初始化映射)""" ArticleDocument.init() def delete_index(self): + """删除blog索引""" from elasticsearch import Elasticsearch es = Elasticsearch(settings.ELASTICSEARCH_DSL['default']['hosts']) es.indices.delete(index='blog', ignore=[400, 404]) def convert_to_doc(self, articles): + """将Django模型对象列表转换为ArticleDocument列表""" return [ ArticleDocument( - meta={ - 'id': article.id}, - body=article.body, - title=article.title, + meta={'id': article.id}, # 用文章ID作为文档ID + body=article.body, # 文章内容 + title=article.title, # 文章标题 author={ - 'nickname': article.author.username, - 'id': article.author.id}, + 'nickname': article.author.username, # 作者用户名 + 'id': article.author.id # 作者ID + }, category={ - 'name': article.category.name, - 'id': article.category.id}, - tags=[ - { - 'name': t.name, - 'id': t.id} for t in article.tags.all()], - pub_time=article.pub_time, - status=article.status, - comment_status=article.comment_status, - type=article.type, - views=article.views, - article_order=article.article_order) for article in articles] + 'name': article.category.name, # 分类名称 + 'id': article.category.id # 分类ID + }, + # 转换标签列表(多对多关系) + tags=[{'name': t.name, 'id': t.id} for t in article.tags.all()], + pub_time=article.pub_time, # 发布时间 + status=article.status, # 文章状态 + comment_status=article.comment_status, # 评论状态 + type=article.type, # 文章类型 + views=article.views, # 浏览量 + article_order=article.article_order # 排序权重 + ) for article in articles + ] def rebuild(self, articles=None): + """重建索引(默认同步所有文章,可指定文章列表)""" + # 初始化索引结构 ArticleDocument.init() + # 如果未指定文章,则同步所有文章 articles = articles if articles else Article.objects.all() + # 转换模型为文档对象 docs = self.convert_to_doc(articles) + # 批量保存文档 for doc in docs: doc.save() def update_docs(self, docs): + """更新文档列表(批量保存)""" for doc in docs: - doc.save() + doc.save() \ No newline at end of file