diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a76c352
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+debug.log
+*/__pycache__
+*/*/__pycache__
+.idea
+*.pyc
diff --git a/app_test/ARIMA.py b/app_test/ARIMA.py
new file mode 100644
index 0000000..76477cf
--- /dev/null
+++ b/app_test/ARIMA.py
@@ -0,0 +1,76 @@
+from typing import List, Union
+
+import numpy as np
+import pandas as pd
+import pmdarima as pm
+
+
+def train_ARIMA_model(endog: Union[np.ndarray, pd.Series],
+ exog: Union[np.ndarray, pd.DataFrame] = None,
+ exog_pred: Union[np.ndarray, pd.DataFrame] = None,
+ steps: int = 20,
+ information_criterion: str = 'aic') -> np.ndarray:
+ """
+ 使用ARIMA模型对时间序列数据进行预测。
+
+ Args:
+ endog (Union[np.ndarray, pd.Series]): 要分析的时间序列数据。
+ exog (Union[np.ndarray, pd.DataFrame], optional): 用于改进ARIMA模型的外生变量。默认为None。
+ exog_pred (Union[np.ndarray, pd.DataFrame], optional): 预测期间的外生变量,必须与训练期间的外生变量列数一致。默认为None。
+ steps (int, optional, default=20): 预测期的长度。
+ information_criterion (str, optional, default='aic'): 选择模型的信息准则,'aic'或'bic'。
+
+ Returns:
+ np.ndarray: 预测结果。
+ """
+ model = pm.auto_arima(endog,
+ X=exog,
+ seasonal=False,
+ information_criterion=information_criterion)
+
+ pred = model.predict(n_periods=steps, X=exog_pred)
+ return pred
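+
+# Illustrative usage (added sketch, not part of the original module):
+#
+#     series = pd.Series(np.random.rand(100).cumsum(),
+#                        index=pd.date_range('2023-01-01', periods=100))
+#     pred = train_ARIMA_model(series, steps=10, information_criterion='bic')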
+
+
+def ARIMA_run(input_data: pd.DataFrame,
+ forecast_target: str,
+ exog_columns: List[str],
+ steps: int = 20) -> pd.DataFrame:
+ """
+ 主运行函数,用以读取数据、训练模型、预测数据。
+
+ Args:
+ input_data (pd.DataFrame): 输入的时间序列数据。
+ forecast_target (str): 需要被预测的目标变量的列名。
+ exog_columns (List[str]): 外生变量的列名列表。
+ steps (int, optional, default=20): 预测步长
+
+ Returns:
+ pd.DataFrame: 预测结果的DataFrame对象。
+ """
+    # Build an index of future dates to hold the forecasts
+ future_index = pd.date_range(start=input_data.index.max() +
+ pd.Timedelta(days=1),
+ periods=steps)
+
+    # Empty frame to collect the forecast exogenous variables
+ df_exog = pd.DataFrame(index=future_index)
+
+    # Forecast each exogenous variable with its own ARIMA model and store the predictions in df_exog
+ for exog in exog_columns:
+ pred = train_ARIMA_model(endog=input_data[exog], steps=steps)
+ df_exog[exog] = pred
+
+    # Train/forecast the target with ARIMA; note that df_exog supplies the exogenous variables for the forecast horizon
+ pred = train_ARIMA_model(endog=input_data[forecast_target],
+ exog=input_data[exog_columns],
+ exog_pred=df_exog[exog_columns],
+ steps=steps,
+ information_criterion='bic')
+
+    # Wrap the predictions in a new DataFrame holding the forecast target
+ forecast_df = pd.DataFrame(pred,
+ index=future_index,
+ columns=[forecast_target])
+
+ return forecast_df
\ No newline at end of file
diff --git a/app_test/Not in a section.py b/app_test/Not in a section.py
new file mode 100644
index 0000000..4287ca8
--- /dev/null
+++ b/app_test/Not in a section.py
@@ -0,0 +1 @@
+#
\ No newline at end of file
diff --git a/app_test/RF.py b/app_test/RF.py
new file mode 100644
index 0000000..6601c0d
--- /dev/null
+++ b/app_test/RF.py
@@ -0,0 +1,118 @@
+from typing import List
+
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import RandomForestRegressor
+
+
+def random_forest_model(train_data: pd.DataFrame,
+ forecast_target: str,
+ exog_columns: List[str],
+ future_data: pd.DataFrame,
+ steps: int = 20) -> pd.DataFrame:
+ """
+ 使用随机森林模型根据给定的特征和目标数据进行训练,并预测未来数据。
+
+ Args:
+ train_data (pd.DataFrame): 训练数据集。
+ forecast_target (str): 训练数据集中的目标列的列名。
+ exog_columns (List[str): 训练数据集用于预测的特征列名的列表。
+ future_data (pd.DataFrame): 存储未来预测所用的外生变量的数据集。
+ steps (int, optional, default=20): 要进行预测的天数。
+
+ Returns:
+ pd.DataFrame: 存储预测结果的数据表。
+ """
+    # Assemble the input features and target variable
+ X = train_data[exog_columns].values
+ y = train_data[forecast_target].values
+ X_test = future_data[exog_columns].values
+
+ model = RandomForestRegressor(n_estimators=1200,
+ max_depth=8,
+ min_samples_split=2,
+ random_state=0)
+
+ model.fit(X, y)
+
+ pred = model.predict(X_test[-steps:])
+
+ forecast_df = pd.DataFrame(
+ pred,
+ index=pd.date_range(start=train_data.index.max() +
+ pd.Timedelta(days=1),
+ periods=steps),
+ columns=[forecast_target])
+
+ return forecast_df
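+
+# Illustrative usage (added sketch; `hist` and `future` are assumed to be
+# daily-indexed DataFrames sharing the exogenous columns):
+#
+#     forecast = random_forest_model(hist, 'hx_jijin_data',
+#                                    ['liugan_index'], future, steps=20)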
+
+
+def forecast_future(data: np.ndarray, steps: int = 20) -> List:
+ """
+ 使用随机森林预测未来的数据。
+
+ Args:
+ data (np.ndarray): 已知的用于预测的数据。
+ steps (int, optional, default=20): 要进行预测的天数。
+
+ Returns:
+ List: 存放预测结果的列表。
+ """
+    # Build lag-1 input/target pairs: predict data[t+1] from data[t]
+    X = data[:-1].reshape(-1, 1)
+    y = data[1:]
+    X_test = [y[-1]]
+
+    # Build and train the random forest model
+    model = RandomForestRegressor(n_estimators=1200,
+                                  max_depth=8,
+                                  min_samples_split=2,
+                                  random_state=0)
+
+    model.fit(X, y)
+
+    # Collect the predictions
+    pred = []
+
+    # Iteratively predict the next data point (take the scalar so the list
+    # holds plain floats rather than one-element arrays)
+    for _ in range(steps):
+        y_pred = model.predict(np.array([X_test[-1]]).reshape(-1, 1))[0]
+        pred.append(y_pred)
+
+        # Feed the prediction back in as the input for the next step
+        X_test.append(y_pred)
+    return pred
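+
+# Illustrative usage (added sketch): the model is fit on lag-1 pairs and the
+# forecast is rolled forward one step at a time:
+#
+#     history = np.array([1.0, 1.2, 1.4, 1.7, 2.0])
+#     next_five = forecast_future(history, steps=5)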
+
+
+def RF_run(input_data: pd.DataFrame,
+ forecast_target: str,
+ exog_columns: List[str],
+ steps: int = 20) -> pd.DataFrame:
+ """
+ 执行数据读取、预处理、模型训练、预测并绘图等一系列步骤的主函数。
+
+ Args:
+ input_data (pd.DataFrame): 存储原始数据的DataFrame。
+ forecast_target (str): 需要被预测的目标列名。
+ exog_columns (List[str]): 特征列名的列表。
+ steps (int, optional, default=20): 需要进行预测的天数。
+
+ Returns:
+ pd.DataFrame: 存储预测结果的数据表。
+ """
+    # Build an index of future dates to hold the forecasts
+ future_index = pd.date_range(start=input_data.index.max() +
+ pd.Timedelta(days=1),
+ periods=steps)
+
+    # Empty frame to collect the forecast exogenous variables
+ df_exog = pd.DataFrame(index=future_index)
+
+ for exog in exog_columns:
+ pred = forecast_future(input_data[exog].values, steps=steps)
+ df_exog[exog] = pred
+
+ df_processed = random_forest_model(input_data, forecast_target,
+ exog_columns, df_exog, steps)
+
+ return df_processed
\ No newline at end of file
diff --git a/app_test/VAR.py b/app_test/VAR.py
new file mode 100644
index 0000000..d6660f4
--- /dev/null
+++ b/app_test/VAR.py
@@ -0,0 +1,94 @@
+from typing import List
+
+import numpy as np
+# from statsmodels.tsa.api import VAR
+import pandas as pd
+import statsmodels.api as sm
+
+
+def convert_timestamp_index(data: pd.DataFrame,
+ to_period: bool) -> pd.DataFrame:
+ """
+ 根据to_period参数,选择将数据的时间索引转换为DatetimeIndex或PeriodIndex。
+
+ Args:
+ data (pd.DataFrame): 输入的数据。
+ to_period (bool): 如果为True,则将DatetimeIndex转换为PeriodIndex;
+ 如果为False,则将PeriodIndex转换为DatetimeIndex。
+
+ Returns:
+ pd.DataFrame: 索引被转换后的数据。
+ """
+ if to_period:
+ data.index = pd.DatetimeIndex(data.index).to_period('D')
+ else:
+ data.index = data.index.to_timestamp()
+
+ return data
+
+
+def train_VAR_model(data: pd.DataFrame, max_lags: int = 30):
+ """
+ 利用输入的时间序列数据训练VAR模型,通过比较BIC值确定最优滞后阶数。
+
+ Args:
+ data (pd.DataFrame): 用于模型训练的时间序列数据。
+ max_lags (int, default=30): 最大滞后阶数,默认为 30。
+
+ Returns:
+ VARResultsWrapper: 训练得到的VAR模型。
+ """
+ model = sm.tsa.VAR(data)
+ criteria = []
+ lags = range(1, max_lags + 1)
+
+    # Pick the optimal lag order by comparing each candidate model's BIC
+ for lag in lags:
+ result = model.fit(maxlags=lag)
+ criteria.append(result.bic)
+
+    # Refit the model with the optimal lag order
+ best_lag = lags[criteria.index(min(criteria))]
+ results = model.fit(maxlags=best_lag)
+
+ return results
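+
+# Illustrative usage (added sketch): fit on a two-column frame and inspect
+# the selected lag order:
+#
+#     results = train_VAR_model(df[['liugan_index', 'beijing_number']])
+#     print(results.k_ar)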
+
+
+def VAR_run(input_data: pd.DataFrame,
+ forecast_target: str,
+ _: List[str],
+ steps: int = 20) -> pd.DataFrame:
+ """
+ 运行函数,执行一系列步骤,包括索引转换、训练模型、数据预测。
+
+ Args:
+ input_data (pd.DataFrame): 输入的DataFrame数据。
+ forecast_target (str): 需要被预测的目标变量的列名。
+ _ (List[str]): 占位参数,用于保持和其他模型函数的接口一致性。
+ steps (int, default=20): 预测步数。
+
+ Returns:
+ pd.DataFrame: 预测结果的DataFrame对象。
+ """
+ input_data = input_data.replace([np.inf, -np.inf], np.nan).dropna()
+    # Convert the DataFrame's time index to a PeriodIndex
+ input_data = convert_timestamp_index(input_data, to_period=True)
+    # Add tiny Gaussian noise (jitter) so the covariance matrix stays positive definite
+ input_data += np.random.normal(0, 1e-10, input_data.shape)
+    # Train the VAR model
+ model = train_VAR_model(input_data, max_lags=10)
+
+    # Convert the time index back to a DatetimeIndex
+ input_data = convert_timestamp_index(input_data, to_period=False)
+
+    # Forecast with the fitted VAR model
+ pred = model.forecast(input_data.values[-model.k_ar:], steps=steps)
+ forecast_df = pd.DataFrame(
+ pred,
+ index=pd.date_range(start=input_data.index.max() +
+ pd.Timedelta(days=1),
+ periods=steps),
+ columns=input_data.columns)
+
+    # Return a one-column DataFrame, matching the declared return type
+    return forecast_df[[forecast_target]]
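+
+
+if __name__ == '__main__':
+    # Smoke test (added sketch, not part of the original module): forecast
+    # one column of a small synthetic two-series frame.
+    idx = pd.date_range('2023-01-01', periods=120, freq='D')
+    demo = pd.DataFrame({'a': np.random.rand(120).cumsum(),
+                         'b': np.random.rand(120).cumsum()}, index=idx)
+    print(VAR_run(demo, 'a', [], steps=5).head())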
diff --git a/app_test/__init__.py b/app_test/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app_test/add_fund_data.py b/app_test/add_fund_data.py
new file mode 100644
index 0000000..b289944
--- /dev/null
+++ b/app_test/add_fund_data.py
@@ -0,0 +1,49 @@
+# add_fund_data.py
+import pandas as pd
+
+from app_test.tiantian_jijin_spider import get_fund_data_by_code
+
+
+def normalize_df(df):
+ """
+ 对 DataFrame 对象进行最小最大标准化。
+
+ Args:
+ df (DataFrame): 要进行标准化的 DataFrame 对象。
+
+ Returns:
+ df_normalized (DataFrame): 进行最小最大标准化后的 DataFrame 对象。
+ """
+    # Min-max scale a column only if its dtype kind is boolean, signed or
+    # unsigned integer, float, or complex; otherwise keep the column as-is
+    df_normalized = df.apply(lambda x: (x - x.min()) / (x.max() - x.min())
+                             if x.dtype.kind in 'biufc' else x)
+
+ return df_normalized
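+
+# Illustrative usage (added sketch): numeric columns are scaled to [0, 1],
+# non-numeric columns pass through unchanged:
+#
+#     demo = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
+#     print(normalize_df(demo))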
+
+
+def add_fund_data(fund_code):
+    df = pd.read_csv('filled_row_data.csv')
+    fund_data = get_fund_data_by_code(fund_code)
+
+    # Merge the fund series (the dependent variable to forecast) into the
+    # preprocessed data
+    # del fund_data['id']  # drop the 'id' column if present
+    df_merged = pd.merge(df, fund_data, how='inner', on='date')
+    df_merged['date'] = pd.to_datetime(df_merged['date'])
+    df_merged.set_index('date', inplace=True)
+
+    # Linearly interpolate missing values (alternatives: polynomial
+    # interpolation?)
+    df_merged = df_merged.interpolate()
+
+    # Drop any rows that still contain NaN
+    df_merged.dropna(inplace=True)
+    # df_merged = normalize_df(df_merged)
+    print('添加基金数据成功')
+    return df_merged
diff --git a/app_test/admin.py b/app_test/admin.py
new file mode 100644
index 0000000..8c38f3f
--- /dev/null
+++ b/app_test/admin.py
@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.
diff --git a/app_test/apps.py b/app_test/apps.py
new file mode 100644
index 0000000..678ae9f
--- /dev/null
+++ b/app_test/apps.py
@@ -0,0 +1,10 @@
+# app_test/apps.py
+from django.apps import AppConfig
+
+
+class AppTestConfig(AppConfig):
+ name = 'app_test'
+
+ def ready(self):
+ # from .tasks import setup_periodic_tasks
+ # setup_periodic_tasks()
+ pass
\ No newline at end of file
diff --git a/app_test/beijing_zhoubao_spider.py b/app_test/beijing_zhoubao_spider.py
new file mode 100644
index 0000000..633851e
--- /dev/null
+++ b/app_test/beijing_zhoubao_spider.py
@@ -0,0 +1,148 @@
+import asyncio
+import os
+import random
+import re
+import time
+from datetime import datetime, timedelta, date
+from multiprocessing.pool import ThreadPool
+
+import django
+import matplotlib.pyplot as plt
+import pandas as pd
+import requests
+from django.db import IntegrityError
+from lxml import etree
+from pylab import mpl
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'liugan_yuce.liugan_yuce.settings')
+django.setup()
+
+# Django must be configured before app models can be imported
+from .models import BeijingWeekData
+from .user_agents_pool import agent_list  # user_agents_pool.py must live in this package and define agent_list
+
+mpl.rcParams["font.sans-serif"] = ["SimHei"]
+mpl.rcParams["axes.unicode_minus"] = False
+
+class GetBeijingGanranShuju(object):
+ def __init__(self):
+ user_agent = random.choice(agent_list)
+ self.headers = {
+ "User-Agent": user_agent,
+ }
+        self.data = []
+        self.month_data = []
+        self.link_list_2023 = []
+        self.link_list_2024 = []
+
+ def get_Link_2023(self, url):
+ response = requests.get(url=url, headers=self.headers)
+ time.sleep(random.uniform(1, 3))
+ html = response.content.decode("utf-8")
+        # NOTE: reconstructed pattern (the original regex was lost in this
+        # capture); verify against the live page markup
+        link_2023 = re.findall(r'<a href="(.*?)"[^>]*>[^<]*2023', html)
+ for i in link_2023:
+ url_head = "https://www.bjcdc.org/"
+ i = url_head + i
+ self.link_list_2023.append(i)
+ return self.link_list_2023
+
+ def get_Link_2024(self, url):
+ response = requests.get(url=url, headers=self.headers)
+ time.sleep(random.uniform(1, 3))
+ html = response.content.decode("utf-8")
+        # NOTE: reconstructed pattern, as in get_Link_2023
+        link_2024 = re.findall(r'<a href="(.*?)"[^>]*>[^<]*2024', html)
+ for i in link_2024:
+ url_head = "https://www.bjcdc.org/"
+ i = url_head + i
+ self.link_list_2024.append(i)
+ return self.link_list_2024
+
+ def get_content_2023(self, link):
+ response = requests.get(url=link, headers=self.headers)
+        time.sleep(random.uniform(1, 3))
+ html = response.content.decode("utf-8")
+ number_list = re.findall(r'(\d+)例', html, re.DOTALL)
+ number = number_list[0] if number_list else ''
+ time_list = re.findall(r'(\d+月\d+日至2023年\d+月\d+日)', html)
+ if time_list:
+ time_str = time_list[0]
+ time1 = re.match(r'\d+月\d+日?', time_str).group()
+ month_number = re.match(r'\d{1,2}', time1).group()
+ day_number = re.findall(r'月(\d{1,2})', time1)[0]
+            date_str = f'2023-{int(month_number):02d}-{int(day_number):02d}'
+            if number.isdigit():
+                self.data.append([date_str, number])
+
+ def get_content_2024(self, link):
+ response = requests.get(url=link, headers=self.headers)
+        time.sleep(random.uniform(1, 3))
+ html = response.content.decode("utf-8")
+ if '周' not in html:
+ number_list = re.findall(r'(\d+)例', html, re.DOTALL)
+ number = number_list[0] if number_list else ''
+ time_list = re.findall(r'(\d+年\d+月)', html)
+ if time_list:
+                month_str = time_list[0]
+                if number.isdigit():
+                    self.month_data.append([month_str, number])
+
+def get_beijing_zhoubao():
+    # Instantiate the Beijing infectious-disease data scraper
+ get_beijing_ganran_shuju = GetBeijingGanranShuju()
+ url_1 = ['https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index.shtml']
+ url_list2 = [f'https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index_{i}.shtml' for i in range(2, 5)]
+ url_list = url_1 + url_list2
+
+ # 2023
+ for url in url_list:
+ get_beijing_ganran_shuju.get_Link_2023(url)
+
+    # Fetch the 2023 report pages concurrently with a thread pool
+ pool = ThreadPool(100)
+ pool.map(get_beijing_ganran_shuju.get_content_2023, reversed(get_beijing_ganran_shuju.link_list_2023))
+ pool.close()
+ pool.join()
+
+    # 2024
+ for url in url_list:
+ get_beijing_ganran_shuju.get_Link_2024(url)
+ for x in reversed(get_beijing_ganran_shuju.link_list_2024):
+ get_beijing_ganran_shuju.get_content_2024(x)
+
+ df = pd.DataFrame(get_beijing_ganran_shuju.data, columns=['日期', '感染数量'])
+ df = df[df['日期'] != '2023-12-26']
+ df['日期'] = pd.to_datetime(df['日期'])
+ df_week = df.sort_values(by='日期')
+ from datetime import date
+ today = date.today()
+ start_date = datetime(2024, 1, 2)
+ end_date = datetime.now()
+
+ dates = []
+ while start_date <= end_date:
+ dates.append(start_date)
+ start_date += timedelta(days=7)
+
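+    # Approximate weekly counts by spreading each monthly total over four weeks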
+ infections = {datetime.strptime(month, "%Y年%m月").strftime("%Y-%m"): int(int(total) / 4) for month, total in get_beijing_ganran_shuju.month_data}
+
+ date_infections = []
+ for date in dates:
+ month_key = date.strftime("%Y-%m")
+ if month_key in infections:
+ date_infections.append([date, infections[month_key]])
+
+ month_df = pd.DataFrame(date_infections, columns=['日期', '感染数量'])
+ df = pd.concat([df_week, month_df])
+ df = df.rename(columns={'日期': 'date', '感染数量': 'beijing_number'})
+ print(df)
+
+ converted_data = df.values.tolist()
+ for data in converted_data:
+ obj, created = BeijingWeekData.objects.get_or_create(date=data[0], defaults={'infection_number': data[1]})
+ if created:
+ print(f"Added new record for date {data[0]} with infections {data[1]}")
+ else:
+ print(f"Record for date {data[0]} already exists.")
+ print('成功载入数据库')
+
diff --git a/app_test/data_merged.py b/app_test/data_merged.py
new file mode 100644
index 0000000..6390d62
--- /dev/null
+++ b/app_test/data_merged.py
@@ -0,0 +1,71 @@
+import numpy as np
+import pandas as pd
+from pylab import mpl
+
+mpl.rcParams["font.sans-serif"] = ["SimHei"]
+mpl.rcParams["axes.unicode_minus"] = False
+
+df_baidu = pd.read_csv('../data/baidu_index.csv', encoding='utf-8')      # Baidu flu search index
+df_beijing = pd.read_csv('../data/beijin_zhoubao.csv', encoding='utf-8')  # Beijing infectious-disease weekly report
+df_liugan = pd.read_csv('../data/liugan_zhoubao.csv', encoding='utf-8')   # national flu weekly report
+df_hx = pd.read_csv('../data/hx_jijin_data.csv', encoding='utf-8')  # flu-related fund: 华商医药医疗行业
+df_gf = pd.read_csv('../data/gf_jijin_data.csv', encoding='utf-8')  # flu-related fund: 广发创新医疗两年持有混合
+# Make sure the date columns are datetimes
+df_baidu['date'] = pd.to_datetime(df_baidu['date'])
+df_beijing['date'] = pd.to_datetime(df_beijing['date'])
+df_liugan['date'] = pd.to_datetime(df_liugan['date'])
+df_hx['date'] = pd.to_datetime(df_hx['date'])
+df_gf['date'] = pd.to_datetime(df_gf['date'])
+df1 = df_baidu
+df2 = df_beijing
+df3 = df_liugan
+df4 = df_hx
+df5 = df_gf
+# Build one complete daily date range spanning all sources
+all_dates = pd.date_range(start=min(df1['date'].min(), df2['date'].min(), df3['date'].min()),
+ end=max(df1['date'].max(), df2['date'].max(), df3['date'].max()))
+# Reindex each DataFrame onto the full range, forward-filling the gaps
+df1 = df1.set_index('date').reindex(all_dates).ffill().reset_index().rename(columns={'index': 'date'})
+df2 = df2.set_index('date').reindex(all_dates).ffill().reset_index().rename(columns={'index': 'date'})
+df3 = df3.set_index('date').reindex(all_dates).ffill().reset_index().rename(columns={'index': 'date'})
+df4 = df4.set_index('date').reindex(all_dates).ffill().reset_index().rename(columns={'index': 'date'})
+df5 = df5.set_index('date').reindex(all_dates).ffill().reset_index().rename(columns={'index': 'date'})
+df1.drop(columns=['Unnamed: 0'], inplace=True)
+df2.drop(columns=['Unnamed: 0'], inplace=True)
+df3.drop(columns=['Unnamed: 0'], inplace=True)
+df4.drop(columns=['Unnamed: 0'], inplace=True)
+df5.drop(columns=['Unnamed: 0'], inplace=True)
+# Merge the data sets on date
+df_merged = df1.merge(df2, on='date', how='outer').merge(df3, on='date', how='outer').merge(df4, on='date', how='outer').merge(df5, on='date', how='outer')
+df_merged = df_merged[['date', 'liugan_index', 'beijing_number', 'infection_number','hx_jijin_data','gf_jijin_data']]
+
+# Missing-value handling (alternative considered):
+# df = df_merged.dropna(how='any')
+# Make sure the 'date' column is datetime and set it as the index
+df_merged['date'] = pd.to_datetime(df_merged['date'])
+df_merged.set_index('date', inplace=True)
+
+# Convert only the non-date columns to numeric types
+numerical_columns = df_merged.columns.difference(['date'])  # exclude the 'date' column
+df_merged[numerical_columns] = df_merged[numerical_columns].astype(float)
+
+# Check the dtypes and look for any NaN or inf values
+df_merged = df_merged.astype(float)
+print("Initial NaN or Inf check:", df_merged.isin([np.inf, -np.inf]).sum(), df_merged.isna().sum())
+
+# Handle NaN and infinite values
+df_merged.replace([np.inf, -np.inf], np.nan, inplace=True)
+df_merged = df_merged.ffill()  # forward-fill NaN values (ffill is not in-place, so assign the result)
+df_merged.dropna(inplace=True)  # drop any rows that still contain NaN
+df_merged.to_csv('../data/merged_data.csv',encoding='utf-8')
+from sklearn.preprocessing import MinMaxScaler
+scaler = MinMaxScaler()
+# Columns to normalize
+columns_to_scale = ['liugan_index', 'beijing_number', 'infection_number', 'hx_jijin_data', 'gf_jijin_data']
+# Min-max scale the selected columns
+df_merged[columns_to_scale] = scaler.fit_transform(df_merged[columns_to_scale])
+# Inspect the normalized data
+print(df_merged.head())
+df_merged.to_csv('../data/merged_data.csv',index=True,encoding = 'utf-8')
diff --git a/app_test/deley_test.py b/app_test/deley_test.py
new file mode 100644
index 0000000..281af09
--- /dev/null
+++ b/app_test/deley_test.py
@@ -0,0 +1,2 @@
+from app_test.tasks import my_scheduled_task
+my_scheduled_task.delay()
diff --git a/app_test/forms.py b/app_test/forms.py
new file mode 100644
index 0000000..bcfa66b
--- /dev/null
+++ b/app_test/forms.py
@@ -0,0 +1,39 @@
+from django import forms
+from django.contrib.auth import get_user_model
+
+from .models import CaptchaModel, Fund
+
+User = get_user_model()
+
+
+class FundForm(forms.ModelForm):
+    class Meta:
+        model = Fund
+        fields = ['fund_id', 'fund_name']
+
+
+class RegisterForm(forms.Form):
+    username = forms.CharField(max_length=20, min_length=2, error_messages={
+ 'required':'请输入用户名',
+ 'max_length':'用户长度在2~20之间',
+ 'min_length':'用户长度在2~20之间'
+ })
+ email = forms.EmailField(error_messages={'required':'请输入邮箱','invalid':'请输入一个正确的邮箱!'})
+ password = forms.CharField(max_length=20,min_length=6)
+
+ def clean_email(self):
+ email = self.cleaned_data.get('email')
+ exists = User.objects.filter(email=email).exists()
+ if exists:
+ raise forms.ValidationError('邮箱已经被注册')
+ return email
+
+    def clean_captcha(self):
+        captcha = self.cleaned_data.get('captcha')
+        email = self.cleaned_data.get('email')
+        captcha_model = CaptchaModel.objects.filter(email=email, captcha=captcha).first()
+        if not captcha_model:
+            raise forms.ValidationError('验证码错误')
+        return captcha
+
+class LoginForm(forms.Form):
+ email = forms.EmailField(error_messages={"required": '请传入邮箱!', 'invalid': '请传入一个正确的邮箱!'})
+ password = forms.CharField(max_length=20, min_length=6)
+ remember = forms.IntegerField(required=False)
+
diff --git a/app_test/get_baidu_index.py b/app_test/get_baidu_index.py
new file mode 100644
index 0000000..e8172ee
--- /dev/null
+++ b/app_test/get_baidu_index.py
@@ -0,0 +1,129 @@
+import random
+import re
+import time
+from datetime import datetime, timedelta, date
+
+import pandas as pd
+import requests
+from pylab import mpl
+
+from .models import BaiduData
+from .user_agents_pool import *
+
+mpl.rcParams["font.sans-serif"] = ["SimHei"]
+mpl.rcParams["axes.unicode_minus"] = False
+
+class DownloadBaiDuIndex(object):
+    """Download Baidu Index (search-volume) data."""
+
+    def __init__(self, cookie):
+        self.cookie = cookie
+        # Request headers
+ self.headers = {
+ "Connection": "keep-alive",
+ "Accept": "application/json, text/plain, */*",
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
+ "Sec-Fetch-Site": "same-origin",
+ "Sec-Fetch-Mode": "cors",
+ "Sec-Fetch-Dest": "empty",
+ "Referer": "https://index.baidu.com/v2/main/index.html",
+ "Accept-Language": "zh-CN,zh;q=0.9",
+ 'Cookie': self.cookie,
+ "Host": "index.baidu.com",
+ "X-Requested-With": "XMLHttpRequest",
+ "Cipher-Text": "1656572408684_1656582701256_Nvm1pABkNsfD7V9VhZSzzFiFKylr3l5NR3YDrmHmH9yfFicm+Z9kmmwKVqVV6unvzAEh5hgXmgelP+OyOeaK8F21LyRVX1BDjxm+ezsglwoe1yfp6lEpuvu5Iggg1dz3PLF8e2II0e80ocXeU0jQFBhSbnB2wjhKl57JggTej12CzuL+h9eeVWdaMO4DSBWU2XX6PfbN8pv9+cdfFhVRHCzb0BJBU3iccoFczwNQUvzLn0nZsu0YPtG5DxDkGlRlZrCfKMtqKAe1tXQhg3+Oww4N3CQUM+6A/tKZA7jfRE6CGTFetC7QQyKlD7nxabkQ5CReAhFYAFAVYJ+sEqmY5pke8s3+RZ6jR7ASOih6Afl35EArbJzzLpnNPgrPCHoJiDUlECJveul7P5vvXl/O/Q==",
+
+ }
+
+    def decrypt(self, ptbk, index_data):
+        # ptbk encodes a substitution cipher: its first half lists the cipher
+        # characters and its second half the matching plaintext characters
+        n = len(ptbk) // 2
+        a = dict(zip(ptbk[:n], ptbk[n:]))
+        return "".join([a[s] for s in index_data])
+ def get_index_data_json(self, keys, start=None, end=None):
+ words = [[{"name": key, "wordType": 1}] for key in keys]
+ words = str(words).replace(" ", "").replace("'", "\"")
+ url = f'http://index.baidu.com/api/SearchApi/index?area=0&word={words}&area=0&startDate={start}&endDate={end}'
+ res = requests.get(url, headers=self.headers)
+ html = res.content.decode("UTF-8")
+ data = res.json()['data']
+ uniqid = data['uniqid']
+ url = f'http://index.baidu.com/Interface/ptbk?uniqid={uniqid}'
+ # print(url)
+ res = requests.get(url, headers=self.headers)
+ html2 = res.content.decode("UTF-8")
+ time.sleep(3)
+ ptbk = res.json()['data']
+ result = {}
+ result["startDate"] = start
+ result["endDate"] = end
+ for userIndexe in data['userIndexes']:
+ name = userIndexe['word'][0]['name']
+ tmp = {}
+ index_all = userIndexe['all']['data']
+ index_all_data = [int(e) for e in self.decrypt(ptbk, index_all).split(",")]
+ tmp["all"] = index_all_data
+ index_pc = userIndexe['pc']['data']
+ index_pc_data = [int(e) for e in self.decrypt(ptbk, index_pc).split(",")]
+ tmp["pc"] = index_pc_data
+ index_wise = userIndexe['wise']['data']
+ index_wise_data = [int(e)
+ for e in self.decrypt(ptbk, index_wise).split(",")]
+ tmp["wise"] = index_wise_data
+ result[name] = tmp
+ return result
+
+    def GetIndex(self, keys, start=None, end=None):
+ today = date.today()
+ if start is None:
+ start = str(today - timedelta(days=8))
+ if end is None:
+ end = str(today - timedelta(days=2))
+ try:
+ raw_data = self.get_index_data_json(keys=keys, start=start, end=end)
+ raw_data = pd.DataFrame(raw_data[keys[0]])
+ raw_data.index = pd.date_range(start=start, end=end)
+ except Exception as e:
+ print(e)
+ raw_data = pd.DataFrame({'all': [], 'pc': [], 'wise': []})
+            # columns: total, PC, and mobile search volume
+ finally:
+ return raw_data
+
+def get_baidu_index():
+ cookie = 'BIDUPSID=84B8FDC3134DE2D8E0E6B86E2BFCC3DC; PSTM=1697213335; BAIDUID=84B8FDC3134DE2D8E0E6B86E2BFCC3DC:SL=0:NR=10:FG=1; BAIDUID_BFESS=84B8FDC3134DE2D8E0E6B86E2BFCC3DC:SL=0:NR=10:FG=1; Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc=1701483117; BDUSS=RUU3ZtM0RwcU9VeW0zV0ltMGhWZXNvd3hoMXc3YmtoZmxOOXktTDNFM3JMNUpsRUFBQUFBJCQAAAAAAQAAAAEAAADwtxh-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOuiamXromplSH; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a04514997999zSyIXXcI1QTeZqm4c8hyxlWksvkordeK7x1ZPceY2CR3NLufUujm7MOZ3p6TYUaUvd3Qjet3M3JcQfM5hy8%2FuP9HNu4dCG7B6RoS3S4L25PQZlnh3joEA0cArzaShqjtNyIlDOFD7nF4m%2FHL%2FxUXMnks0IYh6ZyO0xZ1iCY3pJruPDK3dBKJPJ%2BTsLIUPckisDLv5o4FBynumqVmNrIcRJauvv%2BcQtioTBjGMshtfwaZjDT2WCz713NtlH6uxabBdf8gRHMu6r8uSWjXKPG3dAflk5ycDG%2F1BoioLYK697k%3D91877884685963653296273632513192; __cas__rn__=451499799; __cas__st__212=b5f51a7b5b20cb36d3ced6764c8b0e567b436d1a2aa46e1f861833387e9d43267ac11419a4d630081274b162; __cas__id__212=51862268; CPTK_212=1671659797; CPID_212=51862268; bdindexid=473uetvtav5o3d1jfb3m9s3d34; RT="z=1&dm=baidu.com&si=0751b751-3767-4525-9566-4b5f1cd26e3a&ss=lpnhlcxe&sl=8&tt=fr3&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf"; Hm_lpvt_d101ea4d2a5c67dab98251f0b5de24dc=1701490081; ab_sr=1.0.1_MjQ2ODNmNmI4NzI5MzFhZDAxYzIzZDQzYmMyZDAwOTZiYWE5NDY4OGQxMDNkYzA0NGM4OGU1ZDk5YjZmYjdkMTkyNTYxMDJiZmVlMjllNGU1MWQ1YjgwYTAzZGQxMWFkYzEyMDQ3ZjYxMThkNWI1NTg1ZTliOWVmYTQ1M2E3NjhmMDUzNTllNjU3YzYwNDlhOTU0ODRhMzJlZDAwMWY5Yg==; BDUSS_BFESS=RUU3ZtM0RwcU9VeW0zV0ltMGhWZXNvd3hoMXc3YmtoZmxOOXktTDNFM3JMNUpsRUFBQUFBJCQAAAAAAQAAAAEAAADwtxh-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOuiamXromplSH'
+    # Create a downloader instance
+ downloadbaiduindex = DownloadBaiDuIndex(cookie=cookie)
+ # key = input('请输入关键词')
+ key = '流感'
+    # Get today's date
+
+ # from datetime import date
+ today = str(date.today())
+ data = downloadbaiduindex.get_index_data_json(keys=[key], start='2023-01-01', end=today)
+ liugan_data = (data['流感']['all'])
+
+    # Set the start and end dates
+ start_date = date(2023, 1, 1)
+ end_date = datetime.now().date() + timedelta(days=7)
+
+    # Build a list of dates at one-week intervals
+ date_list = []
+ current_date = start_date
+ while current_date <= end_date:
+ date_list.append(current_date)
+        current_date += timedelta(weeks=1)  # advance one week at a time
+ date_list = date_list[:len(liugan_data)]
+
+ df = pd.DataFrame({
+ 'date': date_list,
+ 'liugan_index': liugan_data
+ })
+ df = df.drop(df.index[-1])
+ print(df)
+ converted_data = df.values.tolist()
+ for data in converted_data:
+        # get_or_create avoids inserting duplicate rows
+ obj, created = BaiduData.objects.get_or_create(date=data[0], defaults={'liugan_index': data[1]})
+ if created:
+ print(f"Added new record for date {data[0]} with infections {data[1]}")
+ else:
+ print(f"Record for date {data[0]} already exists.")
+ print('成功载入数据库')
diff --git a/app_test/liugan_zhoubao_spider.py b/app_test/liugan_zhoubao_spider.py
new file mode 100644
index 0000000..a08b2cb
--- /dev/null
+++ b/app_test/liugan_zhoubao_spider.py
@@ -0,0 +1,125 @@
+import os
+import random
+import re
+import time
+from datetime import datetime
+from multiprocessing.pool import ThreadPool
+
+import django
+import requests
+from django.db import IntegrityError
+from lxml import etree
+from tqdm import *
+
+from .user_agents_pool import *
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'liugan_yuce.liugan_yuce.settings')
+django.setup()
+
+# Django is configured above, so app models can now be imported safely
+from app_test.models import LiuganWeekData
+
+url_1 = ['https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index.htm']
+url_list2 = [f'https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index_{i}.htm' for i in range(1, 4)]
+url_list = url_1 + url_list2
+
+user_Agent = random.choice(agent_list)
+headers = {
+ "User-Agent": user_Agent,
+}
+
+
+def get_Link(url):
+ link_list = []
+ response = requests.get(url=url, headers=headers)
+ time.sleep(1)
+ html = response.content.decode("utf-8")
+ tree = etree.HTML(html)
+ li_list = tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/ul/li')
+ # print(len(li_list))
+ for table in li_list:
+ link = table.xpath("./span[1]/a/@href")[0]
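+        # Turn the relative href ('./xxx.htm') into an absolute URL: strip
+        # the dots, prepend the section root, then restore the '.htm' suffix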
+ link = link.replace('.','')
+ url_head = "https://ivdc.chinacdc.cn/cnic/zyzx/lgzb"
+ link = url_head + link
+ link = link.replace('htm','.htm')
+ link_list.append(link)
+ return link_list
+
+
+def get_content(link):
+ response = requests.get(url=link, headers=headers)
+ time.sleep(1.5)
+ html = response.content.decode("utf-8")
+ # print(html)
+ tree = etree.HTML(html)
+ date = tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/text()')[1]
+ # print(time)
+ year = tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/span/text()')[0]
+ # print(year)
+ date = year+date
+ date = date.replace(')','')
+ date_format = '%Y年%m月%d日'
+ target_date = datetime.strptime(date, date_format)
+ # print(target_date)
+ start_time = '2023年2月18日'
+ start_date = datetime.strptime(start_time, date_format)
+ if target_date > start_date:
+        # NOTE: reconstructed pattern (the original regex was corrupted in
+        # this capture); it pulls the case count out of the styled report text
+        specific_number = re.search(r'font-size: 10pt;">(\d+)(?=起)', html)
+        number = specific_number.group(1) if specific_number else None
+        if number is None:
+            # fall back to the first bare digit run
+            pattern = r'(\d+)'
+添加基金
+ """, unsafe_allow_html=True)
+
+    # Text input with a placeholder prompt
+    fund_code = st.text_input('基金代码', placeholder='请输入基金代码', key='fund_input', on_change=validate_fund,
+                              label_visibility='collapsed')
+
+    # Show the feedback message, then reset it
+    if st.session_state['message']:
+        st.write(st.session_state['message'])
+        st.session_state['message'] = ''  # reset the message so it is not shown twice
+
+# Page-refresh logic
+if st.session_state['trigger_rerun']:
+    st.session_state['trigger_rerun'] = False
+    st.rerun()
diff --git a/app_test/other_pages/page3.py b/app_test/other_pages/page3.py
new file mode 100644
index 0000000..13bae68
--- /dev/null
+++ b/app_test/other_pages/page3.py
@@ -0,0 +1,276 @@
+# import pandas as pd
+# import pandas_profiling
+# import streamlit as st
+# # from pydantic_settings import BaseSettings
+#
+# from streamlit_pandas_profiling import st_profile_report
+#
+# df = pd.read_csv("https://storage.googleapis.com/tf-datasets/titanic/train.csv")
+# pr = df.profile_report()
+#
+# st_profile_report(pr)
+import os
+
+import django
+from django.conf import settings
+
+os.chdir('D:/python/djangoProject/test_Bootstrap')
+# Set the Django settings module
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'test_Bootstrap.settings')
+print('开始初始化')
+# Force Django initialization
+django.setup()
+print("Django configured.")
+
+print("Starting Streamlit...")
+import streamlit as st
+import streamlit.components.v1 as components  # used by result_visualization below
+from streamlit_vertical_slider import vertical_slider
+from st_pages import Page, Section, show_pages, add_page_title, add_indentation
+add_page_title()
+add_indentation()
+
+# Click callback: restore the default weights
+def reset_weights():
+    st.session_state.slider_values = [32, 12, 43, 12, 12]
+    st.session_state.reset_trigger += 1
+
+# Initialize the session-state keys
+if 'slider_values' not in st.session_state:
+    st.session_state.slider_values = [32, 12, 43, 12, 12]
+if 'reset_trigger' not in st.session_state:
+    st.session_state.reset_trigger = 0
+if 'fund_code' not in st.session_state:
+    st.session_state['fund_code'] = ''
+col1, col2 = st.columns([0.8, 0.2])
+
+with col1:
+    # Use HTML with inline CSS to enlarge the heading font
+    st.markdown("""
+ """, unsafe_allow_html=True) + + # 创建一个输入框,使用 placeholder 参数设置提示语 + fund_code = st.text_input('基金代码', placeholder='请输入基金代码', key='fund_input', on_change=validate_fund, + label_visibility='collapsed') + + # 显示反馈信息并在显示后重置 + if st.session_state['message']: + st.write(st.session_state['message']) + st.session_state['message'] = '' # 重置消息以避免重复显示 + +# 页面刷新逻辑 +if st.session_state['trigger_rerun']: + st.session_state['trigger_rerun'] = False + st.rerun() + + + + + + + + + + diff --git a/app_test/other_pages/page3.py b/app_test/other_pages/page3.py new file mode 100644 index 0000000..13bae68 --- /dev/null +++ b/app_test/other_pages/page3.py @@ -0,0 +1,276 @@ +# import pandas as pd +# import pandas_profiling +# import streamlit as st +# # from pydantic_settings import BaseSettings +# +# from streamlit_pandas_profiling import st_profile_report +# +# df = pd.read_csv("https://storage.googleapis.com/tf-datasets/titanic/train.csv") +# pr = df.profile_report() +# +# st_profile_report(pr) +import os + +import django +from django.conf import settings + +os.chdir('D:/python/djangoProject/test_Bootstrap') +# 设置 Django 环境变量 +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'test_Bootstrap.settings') +print('开始初始化') +# 强制初始化 Django +django.setup() +print("Django configured.") + +print("Starting Streamlit...") +import streamlit as st +from streamlit_vertical_slider import vertical_slider +from st_pages import Page, Section, show_pages, add_page_title, add_indentation +add_page_title() +add_indentation() + +import streamlit as st +import streamlit as st + +# 定义点击回调函数 +def reset_weights(): + st.session_state.slider_values = [32, 12, 43, 12, 12] + st.session_state.reset_trigger += 1 + +# 初始化 session state 中的键 +if 'slider_values' not in st.session_state: + st.session_state.slider_values = [32, 12, 43, 12, 12] +if 'reset_trigger' not in st.session_state: + st.session_state.reset_trigger = 0 +if 'fund_code' not in st.session_state: + st.session_state['fund_code'] = '' +col1, col2 = st.columns([0.8, 0.2]) + +with col1: + # 使用 HTML 和内联CSS来增加字体大小 + st.markdown(""" +基金预测
+ """, unsafe_allow_html=True) + # 创建一个输入框,使用 placeholder 参数设置提示语 + fund_code = st.text_input('基金代码', placeholder='请输入基金代码', key='fund_code', on_change=fund_predect, + label_visibility='collapsed') + print(fund_code) + +def result_visualization(date_js, data_js): + html_content = f""" + + + + + """ + + # 使用 Streamlit 的 HTML 函数将 HTML 内容嵌入页面中 + components.html(html_content, height=350) diff --git a/app_test/other_pages/page4.py b/app_test/other_pages/page4.py new file mode 100644 index 0000000..e69de29 diff --git a/app_test/other_pages/page5.py b/app_test/other_pages/page5.py new file mode 100644 index 0000000..8468b31 --- /dev/null +++ b/app_test/other_pages/page5.py @@ -0,0 +1,249 @@ +import os + +import django +from django.conf import settings + +os.chdir('D:/python/djangoProject/test_Bootstrap') +# 设置 Django 环境变量 +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'test_Bootstrap.settings') +# print('开始初始化') +# # 强制初始化 Django +django.setup() +# print("Django configured.") + +print("Starting Streamlit...") +import streamlit as st +from streamlit_vertical_slider import vertical_slider +from st_pages import Page, Section, show_pages, add_page_title, add_indentation +add_page_title() +add_indentation() + +import streamlit as st +import streamlit as st + +# 定义点击回调函数 +def reset_weights(): + st.session_state.slider_values = [32, 12, 43, 12, 12] + st.session_state.reset_trigger += 1 + +# 初始化 session state 中的键 +if 'slider_values' not in st.session_state: + st.session_state.slider_values = [32, 12, 43, 12, 12] +if 'reset_trigger' not in st.session_state: + st.session_state.reset_trigger = 0 +if 'fund_code' not in st.session_state: + st.session_state['fund_code'] = '' +col1, col2 = st.columns([0.8, 0.2]) + +with col1: + # 使用 HTML 和内联CSS来增加字体大小 + st.markdown(""" +`s get reset. However, we also reset the\n// bottom margin to use `rem` units instead of `em`.\n\np {\n margin-top: 0;\n margin-bottom: $paragraph-margin-bottom;\n}\n\n\n// Abbreviations\n//\n// 1. Add the correct text decoration in Chrome, Edge, Opera, and Safari.\n// 2. Add explicit cursor to indicate changed behavior.\n// 3. Prevent the text-decoration to be skipped.\n\nabbr[title] {\n text-decoration: underline dotted; // 1\n cursor: help; // 2\n text-decoration-skip-ink: none; // 3\n}\n\n\n// Address\n\naddress {\n margin-bottom: 1rem;\n font-style: normal;\n line-height: inherit;\n}\n\n\n// Lists\n\nol,\nul {\n padding-left: 2rem;\n}\n\nol,\nul,\ndl {\n margin-top: 0;\n margin-bottom: 1rem;\n}\n\nol ol,\nul ul,\nol ul,\nul ol {\n margin-bottom: 0;\n}\n\ndt {\n font-weight: $dt-font-weight;\n}\n\n// 1. 