diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a76c352 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +debug.log +*/__pycache__ +*/*/__pycache__ +.idea +*.pyc diff --git a/app_test/ARIMA.py b/app_test/ARIMA.py new file mode 100644 index 0000000..76477cf --- /dev/null +++ b/app_test/ARIMA.py @@ -0,0 +1,76 @@ +from typing import List, Union + +import numpy as np +import pandas as pd +import pmdarima as pm + + +def train_ARIMA_model(endog: Union[np.ndarray, pd.Series], + exog: Union[np.ndarray, pd.DataFrame] = None, + exog_pred: Union[np.ndarray, pd.DataFrame] = None, + steps: int = 20, + information_criterion: str = 'aic') -> np.ndarray: + """ + 使用ARIMA模型对时间序列数据进行预测。 + + Args: + endog (Union[np.ndarray, pd.Series]): 要分析的时间序列数据。 + exog (Union[np.ndarray, pd.DataFrame], optional): 用于改进ARIMA模型的外生变量。默认为None。 + exog_pred (Union[np.ndarray, pd.DataFrame], optional): 预测期间的外生变量,必须与训练期间的外生变量列数一致。默认为None。 + steps (int, optional, default=20): 预测期的长度。 + information_criterion (str, optional, default='aic'): 选择模型的信息准则,'aic'或'bic'。 + + Returns: + np.ndarray: 预测结果。 + """ + model = pm.auto_arima(endog, + X=exog, + seasonal=False, + information_criterion=information_criterion) + + pred = model.predict(n_periods=steps, X=exog_pred) + return pred + + +def ARIMA_run(input_data: pd.DataFrame, + forecast_target: str, + exog_columns: List[str], + steps: int = 20) -> pd.DataFrame: + """ + 主运行函数,用以读取数据、训练模型、预测数据。 + + Args: + input_data (pd.DataFrame): 输入的时间序列数据。 + forecast_target (str): 需要被预测的目标变量的列名。 + exog_columns (List[str]): 外生变量的列名列表。 + steps (int, optional, default=20): 预测步长 + + Returns: + pd.DataFrame: 预测结果的DataFrame对象。 + """ + # 创建一个未来日期的索引,用于保存预测数据 + future_index = pd.date_range(start=input_data.index.max() + + pd.Timedelta(days=1), + periods=steps) + + # 创建一个用于保存预测外生变量的空数据帧 + df_exog = pd.DataFrame(index=future_index) + + # 循环每个外生变量,使用ARIMA模型进行训练和预测,然后将预测值保存到df_exog中 + for exog in exog_columns: + pred = train_ARIMA_model(endog=input_data[exog], steps=steps) + df_exog[exog] = pred + + # 使用ARIMA模型对目标变量进行训练和预测,注意这里将df_exog作为预测阶段的外生变量传入 + pred = train_ARIMA_model(endog=input_data[forecast_target], + exog=input_data[exog_columns], + exog_pred=df_exog[exog_columns], + steps=steps, + information_criterion='bic') + + # 根据预测值创建一个新的数据帧,用于保存预测的目标变量 + forecast_df = pd.DataFrame(pred, + index=future_index, + columns=[forecast_target]) + + return forecast_df \ No newline at end of file diff --git a/app_test/Not in a section.py b/app_test/Not in a section.py new file mode 100644 index 0000000..4287ca8 --- /dev/null +++ b/app_test/Not in a section.py @@ -0,0 +1 @@ +# \ No newline at end of file diff --git a/app_test/RF.py b/app_test/RF.py new file mode 100644 index 0000000..6601c0d --- /dev/null +++ b/app_test/RF.py @@ -0,0 +1,118 @@ +from typing import List + +import numpy as np +import pandas as pd +from sklearn.ensemble import RandomForestRegressor + + +def random_forest_model(train_data: pd.DataFrame, + forecast_target: str, + exog_columns: List[str], + future_data: pd.DataFrame, + steps: int = 20) -> pd.DataFrame: + """ + 使用随机森林模型根据给定的特征和目标数据进行训练,并预测未来数据。 + + Args: + train_data (pd.DataFrame): 训练数据集。 + forecast_target (str): 训练数据集中的目标列的列名。 + exog_columns (List[str): 训练数据集用于预测的特征列名的列表。 + future_data (pd.DataFrame): 存储未来预测所用的外生变量的数据集。 + steps (int, optional, default=20): 要进行预测的天数。 + + Returns: + pd.DataFrame: 存储预测结果的数据表。 + """ + # 制作输入特征和目标变量 + X = train_data[exog_columns].values + y = train_data[forecast_target].values + X_test = future_data[exog_columns].values + + model = 
RandomForestRegressor(n_estimators=1200,
+                                  max_depth=8,
+                                  min_samples_split=2,
+                                  random_state=0)
+
+    model.fit(X, y)
+
+    pred = model.predict(X_test[-steps:])
+
+    forecast_df = pd.DataFrame(
+        pred,
+        index=pd.date_range(start=train_data.index.max() +
+                            pd.Timedelta(days=1),
+                            periods=steps),
+        columns=[forecast_target])
+
+    return forecast_df
+
+
+def forecast_future(data: np.ndarray, steps: int = 20) -> List:
+    """
+    Forecast future values with a random forest.
+
+    Args:
+        data (np.ndarray): Known series used for forecasting.
+        steps (int, optional, default=20): Number of days to forecast.
+
+    Returns:
+        List: List holding the forecast values.
+    """
+    # Build the input features and target: each value predicts the next one
+    X = data[:-1].reshape(-1, 1)
+    y = data[1:]
+    X_test = [y[-1]]
+
+    # Create and train the random forest model
+    model = RandomForestRegressor(n_estimators=1200,
+                                  max_depth=8,
+                                  min_samples_split=2,
+                                  random_state=0)
+
+    model.fit(X, y)
+
+    # List that collects the predictions
+    pred = []
+
+    # Iteratively predict the next data point
+    for _ in range(steps):
+        y_pred = model.predict(np.array([X_test[-1]]).reshape(-1, 1))
+        # predict() returns a length-1 array; store the scalar so downstream
+        # frames get a flat 1-D column instead of a column of arrays
+        pred.append(y_pred[0])
+
+        # Feed the prediction back as the input of the next step
+        X_test.append(y_pred[0])
+    return pred
+
+
+def RF_run(input_data: pd.DataFrame,
+           forecast_target: str,
+           exog_columns: List[str],
+           steps: int = 20) -> pd.DataFrame:
+    """
+    Main function: reads the data, trains the models and produces the forecast.
+
+    Args:
+        input_data (pd.DataFrame): DataFrame holding the raw data.
+        forecast_target (str): Name of the target column to forecast.
+        exog_columns (List[str]): List of feature column names.
+        steps (int, optional, default=20): Number of days to forecast.
+
+    Returns:
+        pd.DataFrame: DataFrame holding the forecast.
+    """
+    # Future-date index under which the forecasts are stored
+    future_index = pd.date_range(start=input_data.index.max() +
+                                 pd.Timedelta(days=1),
+                                 periods=steps)
+
+    # Empty frame for the forecast exogenous variables
+    df_exog = pd.DataFrame(index=future_index)
+
+    for exog in exog_columns:
+        pred = forecast_future(input_data[exog].values, steps=steps)
+        df_exog[exog] = pred
+
+    df_processed = random_forest_model(input_data, forecast_target,
+                                       exog_columns, df_exog, steps)
+
+    return df_processed
\ No newline at end of file
diff --git a/app_test/VAR.py b/app_test/VAR.py
new file mode 100644
index 0000000..d6660f4
--- /dev/null
+++ b/app_test/VAR.py
@@ -0,0 +1,94 @@
+import json
+from typing import List
+
+import numpy as np
+# from statsmodels.tsa.api import VAR
+import pandas as pd
+import statsmodels.api as sm
+
+
+def convert_timestamp_index(data: pd.DataFrame,
+                            to_period: bool) -> pd.DataFrame:
+    """
+    Convert the frame's time index to a PeriodIndex or back to a
+    DatetimeIndex, depending on to_period.
+
+    Args:
+        data (pd.DataFrame): Input data.
+        to_period (bool): If True, convert the DatetimeIndex to a PeriodIndex;
+            if False, convert the PeriodIndex back to a DatetimeIndex.
+
+    Returns:
+        pd.DataFrame: The data with its index converted.
+    """
+    if to_period:
+        data.index = pd.DatetimeIndex(data.index).to_period('D')
+    else:
+        data.index = data.index.to_timestamp()
+
+    return data
+
+
+def train_VAR_model(data: pd.DataFrame, max_lags: int = 30):
+    """
+    Fit a VAR model on the input series, picking the lag order with the
+    lowest BIC.
+
+    Args:
+        data (pd.DataFrame): Time series used to fit the model.
+        max_lags (int, default=30): Maximum lag order to try.
+
+    Returns:
+        VARResultsWrapper: The fitted VAR model.
+    """
+    model = sm.tsa.VAR(data)
+    criteria = []
+    lags = range(1, max_lags + 1)
+
+    # Compare the BIC of the fit at every lag order
+    for lag in lags:
+        result = model.fit(maxlags=lag)
+        criteria.append(result.bic)
+
+    # Refit with the best lag order
+    best_lag = lags[criteria.index(min(criteria))]
+    results = model.fit(maxlags=best_lag)
+
+    return results
+
+
+def VAR_run(input_data: pd.DataFrame,
+            forecast_target: str,
+            _: List[str],
+            steps: int = 20) -> pd.Series:
+    """
+    Runner that performs the whole pipeline: index conversion, model
+    training and forecasting.
+
+    Args:
+        input_data (pd.DataFrame): Input DataFrame.
+        forecast_target (str): Name of the target column to forecast.
+        _ (List[str]): Placeholder that keeps the signature consistent with
+            the other model runners (a VAR treats every column as endogenous).
+        steps (int, default=20): Number of forecast steps.
+
+    Returns:
+        pd.Series: The forecast values of the target column.
+    """
+    input_data = input_data.replace([np.inf, -np.inf], np.nan).dropna()
+    # Switch the frame's time index to a PeriodIndex
+    input_data = convert_timestamp_index(input_data, to_period=True)
+    # Add a tiny noise term so the covariance matrix stays positive definite
+    input_data += np.random.normal(0, 1e-10, input_data.shape)
+    # Train the VAR model
+    model = train_VAR_model(input_data, max_lags=10)
+
+    # Switch the time index back
+    input_data = convert_timestamp_index(input_data, to_period=False)
+
+    # Forecast with the fitted VAR model
+    pred = model.forecast(input_data.values[-model.k_ar:], steps=steps)
+    forecast_df = pd.DataFrame(
+        pred,
+        index=pd.date_range(start=input_data.index.max() +
+                            pd.Timedelta(days=1),
+                            periods=steps),
+        columns=input_data.columns)
+
+    return forecast_df[forecast_target]
diff --git a/app_test/__init__.py b/app_test/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app_test/add_fund_data.py b/app_test/add_fund_data.py
new file mode 100644
index 0000000..b289944
--- /dev/null
+++ b/app_test/add_fund_data.py
@@ -0,0 +1,49 @@
+import os
+
+import pandas as pd
+# add_fund_data.py
+from app_test.tiantian_jijin_spider import get_fund_data_by_code
+
+
+def normalize_df(df):
+    """
+    Min-max normalise a DataFrame.
+
+    Args:
+        df (DataFrame): The DataFrame to normalise.
+
+    Returns:
+        df_normalized (DataFrame): The min-max normalised DataFrame.
+    """
+    # Scale a column only if its dtype is boolean, signed/unsigned integer,
+    # float or complex ('biufc'); leave every other column untouched
+    df_normalized = df.apply(lambda x: (x - x.min()) / (x.max() - x.min())
+                             if x.dtype.kind in 'biufc' else x)
+
+    return df_normalized
+
+
+def add_fund_data(fund_code):
+    df = pd.read_csv('filled_row_data.csv')
+    # print(df)
+    fund_data = get_fund_data_by_code(fund_code)
+    # print(fund_data)
+    # Merge the fund series (the dependent variable to forecast) into the
+    # preprocessed data
+    # del fund_data['id']  # drop the 'id' column
+    df_merged = pd.merge(df, fund_data, how='inner', on='date')
+    df_merged['date'] = pd.to_datetime(df_merged['date'])
+    df_merged.set_index('date', inplace=True)
+    # print(type(df_merged.index.max()))
+    # df_merged.to_csv('row_data.csv', encoding='utf-8')
+    # Linearly interpolate missing values (other options: polynomial?)
+    df_merged = df_merged.interpolate()
+
+    # Drop any rows where NaNs remain
+    df_merged.dropna(inplace=True)
+    # df_merged = normalize_df(df_merged)
+    # print(df_merged)
+    print('添加基金数据成功')
+    return df_merged
diff --git a/app_test/admin.py b/app_test/admin.py
new file mode 100644
index 0000000..8c38f3f
--- /dev/null
+++ b/app_test/admin.py
@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.
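The three runners added in this diff — `ARIMA_run`, `RF_run`, and `VAR_run` — intentionally expose the same calling convention (a date-indexed `DataFrame`, the target column name, the exogenous column names, and a horizon in days), so callers can swap models freely; note that `VAR_run` hands back only the target column. A minimal smoke test of that shared interface might look like the sketch below; the synthetic frame and seed are illustrative (only the column names follow `data_merged.py`), not part of the diff:

```python
# Hypothetical smoke test for the shared runner interface; the demo frame
# and the seed are made up for illustration.
import numpy as np
import pandas as pd

from app_test.ARIMA import ARIMA_run
from app_test.RF import RF_run
from app_test.VAR import VAR_run

idx = pd.date_range('2023-01-01', periods=120, freq='D')
rng = np.random.default_rng(0)
demo = pd.DataFrame({
    'liugan_index': rng.normal(size=120).cumsum() + 50,
    'beijing_number': rng.normal(size=120).cumsum() + 100,
    'hx_jijin_data': rng.normal(size=120).cumsum() + 10,
}, index=idx)

# Each runner takes (data, target, exogenous columns, horizon) and returns
# a forecast indexed by the 20 days after the last observed date.
for run in (ARIMA_run, RF_run, VAR_run):
    forecast = run(demo, 'hx_jijin_data', ['liugan_index', 'beijing_number'],
                   steps=20)
    print(run.__module__, forecast.shape)
```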
diff --git a/app_test/apps.py b/app_test/apps.py new file mode 100644 index 0000000..678ae9f --- /dev/null +++ b/app_test/apps.py @@ -0,0 +1,10 @@ +# myapp/apps.py +from django.apps import AppConfig + +class AppTestConfig(AppConfig): + name = 'app_test' + + def ready(self): + # from .tasks import setup_periodic_tasks + # setup_periodic_tasks() + pass \ No newline at end of file diff --git a/app_test/beijing_zhoubao_spider.py b/app_test/beijing_zhoubao_spider.py new file mode 100644 index 0000000..633851e --- /dev/null +++ b/app_test/beijing_zhoubao_spider.py @@ -0,0 +1,148 @@ +import asyncio +import os +import random +import re +import time +from datetime import datetime, timedelta, date +from multiprocessing.pool import ThreadPool + +import django +import matplotlib.pyplot as plt +import pandas as pd +import requests +from django.db import IntegrityError +from lxml import etree +from pylab import mpl + +from .models import BeijingWeekData +from .user_agents_pool import agent_list # 确保 user_agents_pool.py 文件在当前目录,并包含 agent_list + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'liugan_yuce.liugan_yuce.settings') +django.setup() + +mpl.rcParams["font.sans-serif"] = ["SimHei"] +mpl.rcParams["axes.unicode_minus"] = False + +class GetBeijingGanranShuju(object): + def __init__(self): + user_agent = random.choice(agent_list) + self.headers = { + "User-Agent": user_agent, + } + self.data = [] + self.link_list_2023 = [] + self.link_list_2024 = [] + + def get_Link_2023(self, url): + response = requests.get(url=url, headers=self.headers) + time.sleep(random.uniform(1, 3)) + html = response.content.decode("utf-8") + link_2023 = re.findall('', html) + for i in link_2023: + url_head = "https://www.bjcdc.org/" + i = url_head + i + self.link_list_2023.append(i) + return self.link_list_2023 + + def get_Link_2024(self, url): + response = requests.get(url=url, headers=self.headers) + time.sleep(random.uniform(1, 3)) + html = response.content.decode("utf-8") + link_2024 = re.findall('', html) + for i in link_2024: + url_head = "https://www.bjcdc.org/" + i = url_head + i + self.link_list_2024.append(i) + return self.link_list_2024 + + def get_content_2023(self, link): + response = requests.get(url=link, headers=self.headers) + import time + time.sleep(random.uniform(1, 3)) + html = response.content.decode("utf-8") + number_list = re.findall(r'(\d+)例', html, re.DOTALL) + number = number_list[0] if number_list else '' + time_list = re.findall(r'(\d+月\d+日至2023年\d+月\d+日)', html) + if time_list: + time_str = time_list[0] + time1 = re.match(r'\d+月\d+日?', time_str).group() + month_number = re.match(r'\d{1,2}', time1).group() + day_number = re.findall(r'月(\d{1,2})', time1)[0] + time = f'2023-{int(month_number):02d}-{int(day_number):02d}' + if number.isdigit(): + self.data.append([time, number]) + + def get_content_2024(self, link): + response = requests.get(url=link, headers=self.headers) + import time + time.sleep(random.uniform(1, 3)) + html = response.content.decode("utf-8") + if '周' not in html: + number_list = re.findall(r'(\d+)例', html, re.DOTALL) + number = number_list[0] if number_list else '' + time_list = re.findall(r'(\d+年\d+月)', html) + if time_list: + time = time_list[0] + if number.isdigit(): + self.month_data.append([time, number]) + +def get_beijing_zhoubao(): + # 创建获取 获取北京传染病数据 类的实例 + get_beijing_ganran_shuju = GetBeijingGanranShuju() + url_1 = ['https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index.shtml'] + url_list2 = [f'https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index_{i}.shtml' for i in range(2, 5)] + 
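+    # The Beijing CDC bulletin list is paginated: page 1 is the bare
+    # index.shtml, pages 2-4 follow the index_{i}.shtml scheme, and the two
+    # lists are concatenated below before scraping.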
url_list = url_1 + url_list2 + + # 2023 + for url in url_list: + get_beijing_ganran_shuju.get_Link_2023(url) + + # 使用多进程处理每个块 + pool = ThreadPool(100) + pool.map(get_beijing_ganran_shuju.get_content_2023, reversed(get_beijing_ganran_shuju.link_list_2023)) + pool.close() + pool.join() + + # 2024 + get_beijing_ganran_shuju.month_data = [] + for url in url_list: + get_beijing_ganran_shuju.get_Link_2024(url) + for x in reversed(get_beijing_ganran_shuju.link_list_2024): + get_beijing_ganran_shuju.get_content_2024(x) + + df = pd.DataFrame(get_beijing_ganran_shuju.data, columns=['日期', '感染数量']) + df = df[df['日期'] != '2023-12-26'] + df['日期'] = pd.to_datetime(df['日期']) + df_week = df.sort_values(by='日期') + from datetime import date + today = date.today() + start_date = datetime(2024, 1, 2) + end_date = datetime.now() + + dates = [] + while start_date <= end_date: + dates.append(start_date) + start_date += timedelta(days=7) + + infections = {datetime.strptime(month, "%Y年%m月").strftime("%Y-%m"): int(int(total) / 4) for month, total in get_beijing_ganran_shuju.month_data} + + date_infections = [] + for date in dates: + month_key = date.strftime("%Y-%m") + if month_key in infections: + date_infections.append([date, infections[month_key]]) + + month_df = pd.DataFrame(date_infections, columns=['日期', '感染数量']) + df = pd.concat([df_week, month_df]) + df = df.rename(columns={'日期': 'date', '感染数量': 'beijing_number'}) + print(df) + + converted_data = df.values.tolist() + for data in converted_data: + obj, created = BeijingWeekData.objects.get_or_create(date=data[0], defaults={'infection_number': data[1]}) + if created: + print(f"Added new record for date {data[0]} with infections {data[1]}") + else: + print(f"Record for date {data[0]} already exists.") + print('成功载入数据库') + diff --git a/app_test/data_merged.py b/app_test/data_merged.py new file mode 100644 index 0000000..6390d62 --- /dev/null +++ b/app_test/data_merged.py @@ -0,0 +1,71 @@ +import numpy as np +import pandas as pd +from pylab import mpl + +mpl.rcParams["font.sans-serif"] = ["SimHei"] +mpl.rcParams["axes.unicode_minus"] = False + +df_baidu = pd.read_csv('../data/baidu_index.csv',encoding = 'utf-8')# 百度流感指数 +df_beijing = pd.read_csv('../data/beijin_zhoubao.csv',encoding = 'utf-8')# 北京传染病周报 +df_liugan = pd.read_csv('../data/liugan_zhoubao.csv',encoding = 'utf-8')# 流感周报 +df_hx = pd.read_csv('../data/hx_jijin_data.csv',encoding = 'utf-8')# 流感基金——华商医药医疗行业 +df_gf = pd.read_csv('../data/gf_jijin_data.csv',encoding = 'utf-8')# 流感基金——广发创新医疗两年持有混合 +# 确保日期列是日期类型 +df_baidu['date'] = pd.to_datetime(df_baidu['date']) +df_beijing['date'] = pd.to_datetime(df_beijing['date']) +df_liugan['date'] = pd.to_datetime(df_liugan['date']) +df_hx['date'] = pd.to_datetime(df_hx['date']) +df_gf['date'] = pd.to_datetime(df_gf['date']) +df1 = df_baidu +df2 = df_beijing +df3 = df_liugan +df4 = df_hx +df5 = df_gf +# 创建一个完整的日期范围 +all_dates = pd.date_range(start=min(df1['date'].min(), df2['date'].min(), df3['date'].min()), + end=max(df1['date'].max(), df2['date'].max(), df3['date'].max())) +# 重新索引每个DataFrame以包括所有日期 +df1 = df1.set_index('date').reindex(all_dates).ffill().reset_index().rename(columns={'index': 'date'}) +df2 = df2.set_index('date').reindex(all_dates).ffill().reset_index().rename(columns={'index': 'date'}) +df3 = df3.set_index('date').reindex(all_dates).ffill().reset_index().rename(columns={'index': 'date'}) +df4 = df4.set_index('date').reindex(all_dates).ffill().reset_index().rename(columns={'index': 'date'}) +df5 = 
df5.set_index('date').reindex(all_dates).ffill().reset_index().rename(columns={'index': 'date'})
+df1.drop(columns=['Unnamed: 0'], inplace=True)
+df2.drop(columns=['Unnamed: 0'], inplace=True)
+df3.drop(columns=['Unnamed: 0'], inplace=True)
+df4.drop(columns=['Unnamed: 0'], inplace=True)
+df5.drop(columns=['Unnamed: 0'], inplace=True)
+# Merge the datasets
+df_merged = df1.merge(df2, on='date', how='outer').merge(df3, on='date', how='outer').merge(df4, on='date', how='outer').merge(df5, on='date', how='outer')
+df_merged = df_merged[['date', 'liugan_index', 'beijing_number', 'infection_number', 'hx_jijin_data', 'gf_jijin_data']]
+
+# Inspect the merged DataFrame
+# print(df_merged.head(20))
+# Missing-value handling
+# df = df_merged.dropna(how='any')
+# Make sure the 'date' column is datetime-typed and use it as the index
+df_merged['date'] = pd.to_datetime(df_merged['date'])
+df_merged.set_index('date', inplace=True)
+
+# Cast only the non-date columns
+numerical_columns = df_merged.columns.difference(['date'])  # excludes 'date'
+df_merged[numerical_columns] = df_merged[numerical_columns].astype(float)
+
+# Verify the dtypes and look for NaN or inf values
+df_merged = df_merged.astype(float)
+print("Initial NaN or Inf check:", df_merged.isin([np.inf, -np.inf]).sum(), df_merged.isna().sum())
+
+# Handle NaN and infinite values
+df_merged.replace([np.inf, -np.inf], np.nan, inplace=True)
+df_merged = df_merged.ffill()  # forward-fill NaNs; ffill() returns a new frame, so reassign
+df_merged.dropna(inplace=True)  # drop any rows where NaNs remain
+df_merged.to_csv('../data/merged_data.csv', encoding='utf-8')
+from sklearn.preprocessing import MinMaxScaler
+scaler = MinMaxScaler()
+# Columns to normalise
+columns_to_scale = ['liugan_index', 'beijing_number', 'infection_number', 'hx_jijin_data', 'gf_jijin_data']
+# Min-max scale the selected columns
+df_merged[columns_to_scale] = scaler.fit_transform(df_merged[columns_to_scale])
+# Inspect the normalised data
+print(df_merged.head())
+df_merged.to_csv('../data/merged_data.csv', index=True, encoding='utf-8')
diff --git a/app_test/deley_test.py b/app_test/deley_test.py
new file mode 100644
index 0000000..281af09
--- /dev/null
+++ b/app_test/deley_test.py
@@ -0,0 +1,2 @@
+from app_test.tasks import my_scheduled_task
+my_scheduled_task.delay()
diff --git a/app_test/forms.py b/app_test/forms.py
new file mode 100644
index 0000000..bcfa66b
--- /dev/null
+++ b/app_test/forms.py
@@ -0,0 +1,39 @@
+from django import forms
+from django.contrib.auth import get_user_model
+
+from .models import CaptchaModel, Fund
+
+User = get_user_model()
+
+
+class FundForm(forms.ModelForm):
+    class Meta:
+        model = Fund
+        fields = ['fund_id', 'fund_name']
+
+
+class RegisterForm(forms.Form):
+    username = forms.CharField(max_length=20, min_length=2, error_messages={
+        'required': '请输入用户名',
+        'max_length': '用户长度在2~20之间',
+        'min_length': '用户长度在2~20之间'
+    })
+    email = forms.EmailField(error_messages={'required': '请输入邮箱', 'invalid': '请输入一个正确的邮箱!'})
+    password = forms.CharField(max_length=20, min_length=6)
+
+    def clean_email(self):
+        email = self.cleaned_data.get('email')
+        exists = User.objects.filter(email=email).exists()
+        if exists:
+            raise forms.ValidationError('邮箱已经被注册')
+        return email
+
+    def clean_captcha(self):
+        captcha = self.cleaned_data.get('captcha')
+        email = self.cleaned_data.get('email')
+        creat_time = self.cleaned_data.get('creat_time')
+        captcha_model = CaptchaModel.objects.filter(email=email, captcha=captcha).first()
+        if not captcha_model:
+            raise forms.ValidationError('验证码错误')
+        return captcha
+
+
+class LoginForm(forms.Form):
+    email = forms.EmailField(error_messages={"required": '请传入邮箱!', 'invalid': '请传入一个正确的邮箱!'})
+    password = forms.CharField(max_length=20, min_length=6)
+    remember = forms.IntegerField(required=False)
+
diff --git a/app_test/get_baidu_index.py
b/app_test/get_baidu_index.py new file mode 100644 index 0000000..e8172ee --- /dev/null +++ b/app_test/get_baidu_index.py @@ -0,0 +1,129 @@ +import random +import re +import time +from datetime import datetime, timedelta, date + +import pandas as pd +import requests +from pylab import mpl + +from .models import BaiduData +from .user_agents_pool import * + +mpl.rcParams["font.sans-serif"] = ["SimHei"] +mpl.rcParams["axes.unicode_minus"] = False + +class DownloadBaiDuIndex(object): +#创建一个类来下载百度指数 + def __init__(self, cookie): + self.cookie = cookie + #配置请求头 + self.headers = { + "Connection": "keep-alive", + "Accept": "application/json, text/plain, */*", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36", + "Sec-Fetch-Site": "same-origin", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Dest": "empty", + "Referer": "https://index.baidu.com/v2/main/index.html", + "Accept-Language": "zh-CN,zh;q=0.9", + 'Cookie': self.cookie, + "Host": "index.baidu.com", + "X-Requested-With": "XMLHttpRequest", + "Cipher-Text": "1656572408684_1656582701256_Nvm1pABkNsfD7V9VhZSzzFiFKylr3l5NR3YDrmHmH9yfFicm+Z9kmmwKVqVV6unvzAEh5hgXmgelP+OyOeaK8F21LyRVX1BDjxm+ezsglwoe1yfp6lEpuvu5Iggg1dz3PLF8e2II0e80ocXeU0jQFBhSbnB2wjhKl57JggTej12CzuL+h9eeVWdaMO4DSBWU2XX6PfbN8pv9+cdfFhVRHCzb0BJBU3iccoFczwNQUvzLn0nZsu0YPtG5DxDkGlRlZrCfKMtqKAe1tXQhg3+Oww4N3CQUM+6A/tKZA7jfRE6CGTFetC7QQyKlD7nxabkQ5CReAhFYAFAVYJ+sEqmY5pke8s3+RZ6jR7ASOih6Afl35EArbJzzLpnNPgrPCHoJiDUlECJveul7P5vvXl/O/Q==", + + } + def decrypt(self, ptbk, index_data): + n = len(ptbk) // 2 + a = dict(zip(ptbk[:n], ptbk[n:])) + return "".join([a[s] for s in index_data]) + def get_index_data_json(self, keys, start=None, end=None): + words = [[{"name": key, "wordType": 1}] for key in keys] + words = str(words).replace(" ", "").replace("'", "\"") + url = f'http://index.baidu.com/api/SearchApi/index?area=0&word={words}&area=0&startDate={start}&endDate={end}' + res = requests.get(url, headers=self.headers) + html = res.content.decode("UTF-8") + data = res.json()['data'] + uniqid = data['uniqid'] + url = f'http://index.baidu.com/Interface/ptbk?uniqid={uniqid}' + # print(url) + res = requests.get(url, headers=self.headers) + html2 = res.content.decode("UTF-8") + time.sleep(3) + ptbk = res.json()['data'] + result = {} + result["startDate"] = start + result["endDate"] = end + for userIndexe in data['userIndexes']: + name = userIndexe['word'][0]['name'] + tmp = {} + index_all = userIndexe['all']['data'] + index_all_data = [int(e) for e in self.decrypt(ptbk, index_all).split(",")] + tmp["all"] = index_all_data + index_pc = userIndexe['pc']['data'] + index_pc_data = [int(e) for e in self.decrypt(ptbk, index_pc).split(",")] + tmp["pc"] = index_pc_data + index_wise = userIndexe['wise']['data'] + index_wise_data = [int(e) + for e in self.decrypt(ptbk, index_wise).split(",")] + tmp["wise"] = index_wise_data + result[name] = tmp + return result + def GetIndex(self, keys, start=None, end=None): + today = date.today() + if start is None: + start = str(today - timedelta(days=8)) + if end is None: + end = str(today - timedelta(days=2)) + try: + raw_data = self.get_index_data_json(keys=keys, start=start, end=end) + raw_data = pd.DataFrame(raw_data[keys[0]]) + raw_data.index = pd.date_range(start=start, end=end) + except Exception as e: + print(e) + raw_data = pd.DataFrame({'all': [], 'pc': [], 'wise': []}) + # 分别表示总计,PC端,移动端 + finally: + return raw_data + +def get_baidu_index(): + cookie = 
'BIDUPSID=84B8FDC3134DE2D8E0E6B86E2BFCC3DC; PSTM=1697213335; BAIDUID=84B8FDC3134DE2D8E0E6B86E2BFCC3DC:SL=0:NR=10:FG=1; BAIDUID_BFESS=84B8FDC3134DE2D8E0E6B86E2BFCC3DC:SL=0:NR=10:FG=1; Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc=1701483117; BDUSS=RUU3ZtM0RwcU9VeW0zV0ltMGhWZXNvd3hoMXc3YmtoZmxOOXktTDNFM3JMNUpsRUFBQUFBJCQAAAAAAQAAAAEAAADwtxh-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOuiamXromplSH; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a04514997999zSyIXXcI1QTeZqm4c8hyxlWksvkordeK7x1ZPceY2CR3NLufUujm7MOZ3p6TYUaUvd3Qjet3M3JcQfM5hy8%2FuP9HNu4dCG7B6RoS3S4L25PQZlnh3joEA0cArzaShqjtNyIlDOFD7nF4m%2FHL%2FxUXMnks0IYh6ZyO0xZ1iCY3pJruPDK3dBKJPJ%2BTsLIUPckisDLv5o4FBynumqVmNrIcRJauvv%2BcQtioTBjGMshtfwaZjDT2WCz713NtlH6uxabBdf8gRHMu6r8uSWjXKPG3dAflk5ycDG%2F1BoioLYK697k%3D91877884685963653296273632513192; __cas__rn__=451499799; __cas__st__212=b5f51a7b5b20cb36d3ced6764c8b0e567b436d1a2aa46e1f861833387e9d43267ac11419a4d630081274b162; __cas__id__212=51862268; CPTK_212=1671659797; CPID_212=51862268; bdindexid=473uetvtav5o3d1jfb3m9s3d34; RT="z=1&dm=baidu.com&si=0751b751-3767-4525-9566-4b5f1cd26e3a&ss=lpnhlcxe&sl=8&tt=fr3&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf"; Hm_lpvt_d101ea4d2a5c67dab98251f0b5de24dc=1701490081; ab_sr=1.0.1_MjQ2ODNmNmI4NzI5MzFhZDAxYzIzZDQzYmMyZDAwOTZiYWE5NDY4OGQxMDNkYzA0NGM4OGU1ZDk5YjZmYjdkMTkyNTYxMDJiZmVlMjllNGU1MWQ1YjgwYTAzZGQxMWFkYzEyMDQ3ZjYxMThkNWI1NTg1ZTliOWVmYTQ1M2E3NjhmMDUzNTllNjU3YzYwNDlhOTU0ODRhMzJlZDAwMWY5Yg==; BDUSS_BFESS=RUU3ZtM0RwcU9VeW0zV0ltMGhWZXNvd3hoMXc3YmtoZmxOOXktTDNFM3JMNUpsRUFBQUFBJCQAAAAAAQAAAAEAAADwtxh-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOuiamXromplSH' + # 初始化一个实例 + downloadbaiduindex = DownloadBaiDuIndex(cookie=cookie) + # key = input('请输入关键词') + key = '流感' + # 获取当天时间 + + # from datetime import date + today = str(date.today()) + data = downloadbaiduindex.get_index_data_json(keys=[key], start='2023-01-01', end=today) + liugan_data = (data['流感']['all']) + + # 设定起始日期和终止日期 + start_date = date(2023, 1, 1) + end_date = datetime.now().date() + timedelta(days=7) + + # 创建日期列表,间隔为一周 + date_list = [] + current_date = start_date + while current_date <= end_date: + date_list.append(current_date) + current_date += timedelta(weeks=1) # 每次增加一周 + date_list = date_list[:len(liugan_data)] + + df = pd.DataFrame({ + 'date': date_list, + 'liugan_index': liugan_data + }) + df = df.drop(df.index[-1]) + print(df) + converted_data = df.values.tolist() + for data in converted_data: + # 使用get_or_create来避免重复数据 + obj, created = BaiduData.objects.get_or_create(date=data[0], defaults={'liugan_index': data[1]}) + if created: + print(f"Added new record for date {data[0]} with infections {data[1]}") + else: + print(f"Record for date {data[0]} already exists.") + print('成功载入数据库') +# 调用函数 diff --git a/app_test/liugan_zhoubao_spider.py b/app_test/liugan_zhoubao_spider.py new file mode 100644 index 0000000..a08b2cb --- /dev/null +++ b/app_test/liugan_zhoubao_spider.py @@ -0,0 +1,125 @@ +import datetime +import os +import random +import re +import time +from datetime import datetime +from multiprocessing.pool import ThreadPool + +import django +import requests +from app_test.models import LiuganWeekData +from django.db import IntegrityError +from lxml import etree +from tqdm import * + +from .user_agents_pool import * + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'liugan_yuce.liugan_yuce.settings') +django.setup() + +# 现在你可以安全地使用 Django 的模型和其他组件了 + 
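+# NOTE: if this module is run standalone, DJANGO_SETTINGS_MODULE must point at
+# a valid settings module and django.setup() must complete before
+# `from app_test.models import LiuganWeekData` above is evaluated.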
+url_1 = ['https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index.htm']
+url_list2 = [f'https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index_{i}.htm' for i in range(1, 4)]
+url_list = url_1 + url_list2
+
+user_Agent = random.choice(agent_list)
+headers = {
+    "User-Agent": user_Agent,
+}
+
+
+def get_Link(url):
+    link_list = []
+    response = requests.get(url=url, headers=headers)
+    time.sleep(1)
+    html = response.content.decode("utf-8")
+    tree = etree.HTML(html)
+    li_list = tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/ul/li')
+    # print(len(li_list))
+    for table in li_list:
+        link = table.xpath("./span[1]/a/@href")[0]
+        # Hrefs are relative ('./...htm'); strip the dots, rebuild an
+        # absolute URL, then restore the '.htm' suffix.
+        link = link.replace('.', '')
+        url_head = "https://ivdc.chinacdc.cn/cnic/zyzx/lgzb"
+        link = url_head + link
+        link = link.replace('htm', '.htm')
+        link_list.append(link)
+    return link_list
+
+
+def get_content(link):
+    response = requests.get(url=link, headers=headers)
+    time.sleep(1.5)
+    html = response.content.decode("utf-8")
+    # print(html)
+    tree = etree.HTML(html)
+    date = tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/text()')[1]
+    # print(date)
+    year = tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/span/text()')[0]
+    # print(year)
+    date = year + date
+    date = date.replace(')', '')
+    date_format = '%Y年%m月%d日'
+    target_date = datetime.strptime(date, date_format)
+    # print(target_date)
+    # Only reports published after the 2023-02-18 issue are parsed
+    start_time = '2023年2月18日'
+    start_date = datetime.strptime(start_time, date_format)
+    if target_date > start_date:
+        # Grab the case count rendered inside the 10pt span, just before '起'
+        specific_number = re.search(r'(?<=font-size: 10pt;\">)(\d+)(?=起)', html)
+        number = specific_number.group(1) if specific_number else None
+        if number is None:
+            pattern = r'(\d+)
添加基金
+ """, unsafe_allow_html=True) + + # 创建一个输入框,使用 placeholder 参数设置提示语 + fund_code = st.text_input('基金代码', placeholder='请输入基金代码', key='fund_input', on_change=validate_fund, + label_visibility='collapsed') + + # 显示反馈信息并在显示后重置 + if st.session_state['message']: + st.write(st.session_state['message']) + st.session_state['message'] = '' # 重置消息以避免重复显示 + +# 页面刷新逻辑 +if st.session_state['trigger_rerun']: + st.session_state['trigger_rerun'] = False + st.rerun() + + + + + + + + + + diff --git a/app_test/other_pages/page3.py b/app_test/other_pages/page3.py new file mode 100644 index 0000000..13bae68 --- /dev/null +++ b/app_test/other_pages/page3.py @@ -0,0 +1,276 @@ +# import pandas as pd +# import pandas_profiling +# import streamlit as st +# # from pydantic_settings import BaseSettings +# +# from streamlit_pandas_profiling import st_profile_report +# +# df = pd.read_csv("https://storage.googleapis.com/tf-datasets/titanic/train.csv") +# pr = df.profile_report() +# +# st_profile_report(pr) +import os + +import django +from django.conf import settings + +os.chdir('D:/python/djangoProject/test_Bootstrap') +# 设置 Django 环境变量 +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'test_Bootstrap.settings') +print('开始初始化') +# 强制初始化 Django +django.setup() +print("Django configured.") + +print("Starting Streamlit...") +import streamlit as st +from streamlit_vertical_slider import vertical_slider +from st_pages import Page, Section, show_pages, add_page_title, add_indentation +add_page_title() +add_indentation() + +import streamlit as st +import streamlit as st + +# 定义点击回调函数 +def reset_weights(): + st.session_state.slider_values = [32, 12, 43, 12, 12] + st.session_state.reset_trigger += 1 + +# 初始化 session state 中的键 +if 'slider_values' not in st.session_state: + st.session_state.slider_values = [32, 12, 43, 12, 12] +if 'reset_trigger' not in st.session_state: + st.session_state.reset_trigger = 0 +if 'fund_code' not in st.session_state: + st.session_state['fund_code'] = '' +col1, col2 = st.columns([0.8, 0.2]) + +with col1: + # 使用 HTML 和内联CSS来增加字体大小 + st.markdown(""" +基金预测
+ """, unsafe_allow_html=True) + # 创建一个输入框,使用 placeholder 参数设置提示语 + fund_code = st.text_input('基金代码', placeholder='请输入基金代码', key='fund_code', on_change=fund_predect, + label_visibility='collapsed') + print(fund_code) + +def result_visualization(date_js, data_js): + html_content = f""" + + + + + """ + + # 使用 Streamlit 的 HTML 函数将 HTML 内容嵌入页面中 + components.html(html_content, height=350) diff --git a/app_test/other_pages/page4.py b/app_test/other_pages/page4.py new file mode 100644 index 0000000..e69de29 diff --git a/app_test/other_pages/page5.py b/app_test/other_pages/page5.py new file mode 100644 index 0000000..8468b31 --- /dev/null +++ b/app_test/other_pages/page5.py @@ -0,0 +1,249 @@ +import os + +import django +from django.conf import settings + +os.chdir('D:/python/djangoProject/test_Bootstrap') +# 设置 Django 环境变量 +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'test_Bootstrap.settings') +# print('开始初始化') +# # 强制初始化 Django +django.setup() +# print("Django configured.") + +print("Starting Streamlit...") +import streamlit as st +from streamlit_vertical_slider import vertical_slider +from st_pages import Page, Section, show_pages, add_page_title, add_indentation +add_page_title() +add_indentation() + +import streamlit as st +import streamlit as st + +# 定义点击回调函数 +def reset_weights(): + st.session_state.slider_values = [32, 12, 43, 12, 12] + st.session_state.reset_trigger += 1 + +# 初始化 session state 中的键 +if 'slider_values' not in st.session_state: + st.session_state.slider_values = [32, 12, 43, 12, 12] +if 'reset_trigger' not in st.session_state: + st.session_state.reset_trigger = 0 +if 'fund_code' not in st.session_state: + st.session_state['fund_code'] = '' +col1, col2 = st.columns([0.8, 0.2]) + +with col1: + # 使用 HTML 和内联CSS来增加字体大小 + st.markdown(""" +`s get reset. However, we also reset the\n// bottom margin to use `rem` units instead of `em`.\n\np {\n margin-top: 0;\n margin-bottom: $paragraph-margin-bottom;\n}\n\n\n// Abbreviations\n//\n// 1. Add the correct text decoration in Chrome, Edge, Opera, and Safari.\n// 2. Add explicit cursor to indicate changed behavior.\n// 3. Prevent the text-decoration to be skipped.\n\nabbr[title] {\n text-decoration: underline dotted; // 1\n cursor: help; // 2\n text-decoration-skip-ink: none; // 3\n}\n\n\n// Address\n\naddress {\n margin-bottom: 1rem;\n font-style: normal;\n line-height: inherit;\n}\n\n\n// Lists\n\nol,\nul {\n padding-left: 2rem;\n}\n\nol,\nul,\ndl {\n margin-top: 0;\n margin-bottom: 1rem;\n}\n\nol ol,\nul ul,\nol ul,\nul ol {\n margin-bottom: 0;\n}\n\ndt {\n font-weight: $dt-font-weight;\n}\n\n// 1. 