You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
ReviewAndAnalyzeOpenSourceS.../src/DjangoBlog/blog/documents.py

289 lines
9.5 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# bjy: 导入时间模块
import time
# bjy: 导入Elasticsearch的客户端模块和异常类
import elasticsearch.client
import elasticsearch.exceptions
# bjy: 导入Django的设置
from django.conf import settings
# bjy: 从elasticsearch_dsl中导入文档、内部文档、字段类型和连接管理器
from elasticsearch_dsl import Document, InnerDoc, Date, Integer, Long, Text, Object, GeoPoint, Keyword, Boolean
from elasticsearch_dsl.connections import connections
# bjy: 从blog应用中导入Article模型
from blog.models import Article
# Elasticsearch features are enabled only when the Django settings define ELASTICSEARCH_DSL.
ELASTICSEARCH_ENABLED = hasattr(settings, 'ELASTICSEARCH_DSL')
# Everything in this branch runs at import time, and only when Elasticsearch is enabled.
if ELASTICSEARCH_ENABLED:
    # Create the elasticsearch_dsl connection from the hosts in the Django settings.
    # NOTE(review): hosts is wrapped in a list here but passed as-is to
    # Elasticsearch() below -- confirm the setting is a single host string.
    connections.create_connection(
        hosts=[settings.ELASTICSEARCH_DSL['default']['hosts']])
    # Import and instantiate a low-level Elasticsearch client for the same hosts.
    from elasticsearch import Elasticsearch
    es = Elasticsearch(settings.ELASTICSEARCH_DSL['default']['hosts'])
    # Import and instantiate the ingest client used to manage pipelines.
    from elasticsearch.client import IngestClient
    c = IngestClient(es)
    # Try to fetch the ingest pipeline named 'geoip'.
    try:
        c.get_pipeline('geoip')
    # A NotFoundError means the pipeline does not exist yet, so create it.
    except elasticsearch.exceptions.NotFoundError:
        # Create the 'geoip' pipeline; its geoip processor reads the "ip" field.
        c.put_pipeline('geoip', body='''{
"description" : "Add geoip info",
"processors" : [
{
"geoip" : {
"field" : "ip"
}
}
]
}''')
# Inner document (InnerDoc) structure holding the geolocation info of an IP address.
class GeoIp(InnerDoc):
    # Continent name
    continent_name = Keyword()
    # Country ISO code
    country_iso_code = Keyword()
    # Country name
    country_name = Keyword()
    # Geographic coordinates (latitude/longitude)
    location = GeoPoint()
# Inner document holding the browser information taken from a User-Agent.
class UserAgentBrowser(InnerDoc):
    # Browser family, e.g. Chrome, Firefox
    Family = Keyword()
    # Browser version
    Version = Keyword()
class UserAgentOS(UserAgentBrowser):
    """Inner document holding the operating-system part of a User-Agent.

    It declares no fields of its own: ``Family`` and ``Version`` are
    inherited unchanged from ``UserAgentBrowser``.
    """
# Inner document holding the device information taken from a User-Agent.
class UserAgentDevice(InnerDoc):
    # Device family, e.g. iPhone, Android
    Family = Keyword()
    # Device brand, e.g. Apple, Samsung
    Brand = Keyword()
    # Device model, e.g. iPhone 12
    Model = Keyword()
# Inner document holding the complete User-Agent information.
class UserAgent(InnerDoc):
    # Nested browser info
    browser = Object(UserAgentBrowser, required=False)
    # Nested operating-system info
    os = Object(UserAgentOS, required=False)
    # Nested device info
    device = Object(UserAgentDevice, required=False)
    # Raw User-Agent string
    string = Text()
    # Whether the client is a crawler or bot
    is_bot = Boolean()
# Elasticsearch document storing page-performance data such as response time.
class ElapsedTimeDocument(Document):
    # Requested URL
    url = Keyword()
    # Time taken by the request (milliseconds, per the original author's note)
    time_taken = Long()
    # Time the log entry was recorded
    log_datetime = Date()
    # Client IP address
    ip = Keyword()
    # Nested IP geolocation info
    geoip = Object(GeoIp, required=False)
    # Nested user-agent info
    useragent = Object(UserAgent, required=False)

    class Index:
        # The index is named 'performance'
        name = 'performance'
        # Shard and replica counts for the index
        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 0
        }

    class Meta:
        # Document type
        doc_type = 'ElapsedTime'
class ElaspedTimeDocumentManager:
    """Helpers for managing and writing to the 'performance' index.

    Every method is static; the class only acts as a namespace around
    ``ElapsedTimeDocument``.
    """

    @staticmethod
    def build_index():
        """Create the 'performance' index if it does not exist yet."""
        from elasticsearch import Elasticsearch
        client = Elasticsearch(settings.ELASTICSEARCH_DSL['default']['hosts'])
        res = client.indices.exists(index="performance")
        if not res:
            ElapsedTimeDocument.init()

    @staticmethod
    def delete_index():
        """Delete the 'performance' index; 400 and 404 responses are ignored."""
        from elasticsearch import Elasticsearch
        # Named ``client`` so it does not shadow the module-level ``es``.
        client = Elasticsearch(settings.ELASTICSEARCH_DSL['default']['hosts'])
        client.indices.delete(index='performance', ignore=[400, 404])

    @staticmethod
    def _new_doc_id():
        """Return a unique document id prefixed with the current time.

        The id has the form ``"<epoch milliseconds>-<32 hex chars>"``.
        A bare millisecond timestamp is not unique: two requests logged in
        the same millisecond would share an id, and the second save would
        silently overwrite the first. The random suffix prevents that while
        the timestamp prefix keeps ids roughly time-ordered.
        """
        # Function-scope import, like the other local imports in this class.
        import uuid
        millis = int(round(time.time() * 1000))
        return '{}-{}'.format(millis, uuid.uuid4().hex)

    @staticmethod
    def create(url, time_taken, log_datetime, useragent, ip):
        """Store one performance record in the 'performance' index.

        Args:
            url: Requested URL.
            time_taken: Time the request took.
            log_datetime: Timestamp of the log entry.
            useragent: Parsed user-agent object. This method reads
                ``browser.family``, ``browser.version_string``,
                ``os.family``, ``os.version_string``, ``device.family``,
                ``device.brand``, ``device.model``, ``ua_string`` and
                ``is_bot`` from it.
            ip: Client IP address; the document is saved through the
                'geoip' ingest pipeline.
        """
        # Make sure the index exists before writing to it.
        ElaspedTimeDocumentManager.build_index()

        # Copy the parsed user-agent into the nested UserAgent inner document.
        ua = UserAgent()
        ua.browser = UserAgentBrowser()
        ua.browser.Family = useragent.browser.family
        ua.browser.Version = useragent.browser.version_string
        ua.os = UserAgentOS()
        ua.os.Family = useragent.os.family
        ua.os.Version = useragent.os.version_string
        ua.device = UserAgentDevice()
        ua.device.Family = useragent.device.family
        ua.device.Brand = useragent.device.brand
        ua.device.Model = useragent.device.model
        ua.string = useragent.ua_string
        ua.is_bot = useragent.is_bot

        doc = ElapsedTimeDocument(
            # Collision-free id; see _new_doc_id for why a bare timestamp
            # is not enough.
            meta={'id': ElaspedTimeDocumentManager._new_doc_id()},
            url=url,
            time_taken=time_taken,
            log_datetime=log_datetime,
            useragent=ua,
            ip=ip)
        # Save through the 'geoip' ingest pipeline.
        doc.save(pipeline="geoip")
# Elasticsearch document storing blog article data to support full-text search.
class ArticleDocument(Document):
    # Article body, indexed and searched with the ik analyzers
    body = Text(analyzer='ik_max_word', search_analyzer='ik_smart')
    # Article title, using the ik analyzers
    title = Text(analyzer='ik_max_word', search_analyzer='ik_smart')
    # Author info, an object-typed field
    author = Object(properties={
        'nickname': Text(analyzer='ik_max_word', search_analyzer='ik_smart'),
        'id': Integer()
    })
    # Category info, an object-typed field
    category = Object(properties={
        'name': Text(analyzer='ik_max_word', search_analyzer='ik_smart'),
        'id': Integer()
    })
    # Tag info, an object-typed field
    tags = Object(properties={
        'name': Text(analyzer='ik_max_word', search_analyzer='ik_smart'),
        'id': Integer()
    })
    # Publication time
    pub_time = Date()
    # Article status
    status = Text()
    # Comment status
    comment_status = Text()
    # Article type
    type = Text()
    # View count
    views = Integer()
    # Article ordering weight
    article_order = Integer()

    class Index:
        # The index is named 'blog'
        name = 'blog'
        # Shard and replica counts for the index
        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 0
        }

    class Meta:
        # Document type
        doc_type = 'Article'
class ArticleDocumentManager:
    """Manager for the 'blog' index that backs ``ArticleDocument``."""

    def __init__(self):
        # Instantiating a manager creates the index right away.
        self.create_index()

    def create_index(self):
        """Create the 'blog' index through ``ArticleDocument.init()``."""
        ArticleDocument.init()

    def delete_index(self):
        """Delete the 'blog' index; 400 and 404 responses are ignored."""
        from elasticsearch import Elasticsearch
        hosts = settings.ELASTICSEARCH_DSL['default']['hosts']
        Elasticsearch(hosts).indices.delete(index='blog', ignore=[400, 404])

    def convert_to_doc(self, articles):
        """Turn an iterable of ``Article`` objects into ``ArticleDocument``s.

        Returns a list with one document per article, in input order. Each
        document uses the article's id as its document id.
        """
        documents = []
        for article in articles:
            documents.append(ArticleDocument(
                meta={'id': article.id},
                body=article.body,
                title=article.title,
                author={
                    'nickname': article.author.username,
                    'id': article.author.id,
                },
                category={
                    'name': article.category.name,
                    'id': article.category.id,
                },
                tags=[{'name': tag.name, 'id': tag.id}
                      for tag in article.tags.all()],
                pub_time=article.pub_time,
                status=article.status,
                comment_status=article.comment_status,
                type=article.type,
                views=article.views,
                article_order=article.article_order))
        return documents

    def rebuild(self, articles=None):
        """Re-index articles into the 'blog' index.

        When ``articles`` is falsy (``None`` or empty), every article from
        ``Article.objects.all()`` is indexed instead.
        """
        ArticleDocument.init()
        if not articles:
            articles = Article.objects.all()
        # Save the converted documents one at a time.
        for document in self.convert_to_doc(articles):
            document.save()

    def update_docs(self, docs):
        """Save each document in ``docs``."""
        for document in docs:
            document.save()