You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
Influenza_fund_linkage_system/app_test/tiantian_jijin_spider.py

179 lines
11 KiB

import os
# Change the process working directory to a hard-coded Windows path
# (presumably the Django project root -- confirm). This runs at import time.
os.chdir('D:/python/djangoProject/test_Bootstrap')
# Set the DJANGO_SETTINGS_MODULE environment variable (only if it is not already set).
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'test_Bootstrap.settings')
import requests
import re
from multiprocessing.pool import ThreadPool
import pandas as pd
# Relative import: this module must be imported as part of its package.
from .models import JijinData
import random
class JijinSpider():
    """
    Scraper for historical fund net-value records served by the Eastmoney
    fund API (``https://api.fund.eastmoney.com/f10/lsjz``).

    All public methods perform live HTTP requests.
    """

    # Endpoint queried for paginated historical net-value records.
    API_URL = 'https://api.fund.eastmoney.com/f10/lsjz'
    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block a worker thread forever.
    REQUEST_TIMEOUT = 10
    # Pages fetched per fund by get_hx_data / get_gf_data (20 records per page,
    # see 'pageSize' in get_jijin_data).
    PAGE_RANGE = range(1, 17)
    # Upper bound on the number of concurrent page requests.
    MAX_WORKERS = 16
    # Captures (FSRQ, DWJZ) pairs from the raw response text; the rest of this
    # class treats them as (date, net value).
    _RECORD_PATTERN = re.compile(r'"FSRQ":"(.*?)","DWJZ":"(.*?)"')

    # NOTE(review): these look like cookies captured from a logged-in browser
    # session ('ct', 'ut', 'pi', 'uidal'). Consider loading them from
    # configuration instead of source control, and confirm whether the API
    # needs them at all.
    _COOKIES = {
        'qgqp_b_id': '5c08ebc12f489b4f5ba9e76c2539ce0b',
        'emshistory': '%5B%2200005%22%2C%2200002%22%2C%2200002%E3%80%81%22%2C%2200001%22%5D',
        'HAList': 'ty-0-300411-%u91D1%u76FE%u80A1%u4EFD%2Cty-0-399366-%u80FD%u6E90%u91D1%u5C5E%2Cty-116-00002-%u4E2D%u7535%u63A7%u80A1%2Cty-116-03119-GX%u4E9A%u6D32%u534A%u5BFC%u4F53%2Cty-116-00007-%u667A%u5BCC%u8D44%u6E90%u6295%u8D44%2Cty-116-00001-%u957F%u548C%2Cty-116-00016-%u65B0%u9E3F%u57FA%u5730%u4EA7%2Cty-0-301075-%u591A%u745E%u533B%u836F%2Cty-90-BK1042-%u533B%u836F%u5546%u4E1A%2Cty-1-601607-%u4E0A%u6D77%u533B%u836F',
        'mtp': '1',
        'ct': 'Rc8QhLQwVpXSsLuf4UOMLbPMtE9gFAEkMTisAatrxh1rv-WFWG9EC-2zw_WFCJnVfsaViwejVO4ziLTZig1GUptw6NORwx36yfzDu9g9zstYkLdwIWvQ-9QqGL-F5C1GCS7xhUtoBrFAibnr_-HA078LL8tr7yWiGM9V3ZmooC8',
        'ut': 'FobyicMgeV54OLFNgnrRk4fT26HSX01NG2N55VZbVzZlqOMDJ-67DsHyCMk6G-yTMaqRhIAFuiYbVkK6Y-sYY8ghkJ3v9gyvUZyHWYpJnreP78yw4o-H8FNcTvUXmOj4KLsGaYuV1TAHltcdN0WDTy-YCOJ8OlzrX-MQbQc_CBvXfUYn10iBhXwvJY94XBkg4eOCJpu6Dok3ot9Xsr8flPIDz6f3KxJcIgnXZ7QpZKDMIvavpSunuMiR8Q5ezUD2y-JiBEgNkeoH_36wg0elojOfd5k61gTK',
        'pi': '6293426663250936%3Bm6293426663250936%3B%E4%BA%89%E5%88%86%E5%A4%BA%E7%A7%92%E7%9A%84%E9%A3%8E%E8%BE%B02%3B4qqIkcy3NvmegD2EnE%2BsOg2O1jjgPTjDxX3du3GmlWaCk8fr0sJ%2FmubqRXtUqqRoZWsMMmMvcfSg1wNNX8p93XE3fanPRZvbcs7bYEjCeUqg5RMcJtmbM9jEifMzwRAAmCipwh9KbqrYLdkLenTwJYqOaG9qmaZ2qDmn2Pa66eitUxhH2q0aU0kerTnJCi2qJnM8Y0Oc%3Bz%2Bzk7gxq8gdHwxSGucOoQSvBZ44Uaf7Um0f7bFnTUgwLnxWm2OMnlrG9SZX6ezbrsEoqVVrOk%2FVRGekqxUH%2BufKtmb89UVNnA0x62lxu6z84Y8dT0sXAWUELHmWZf8cnumRIL8kPvuAcHSXq5P6pTC3OaxbBeQ%3D%3D',
        'uidal': '6293426663250936%e4%ba%89%e5%88%86%e5%a4%ba%e7%a7%92%e7%9a%84%e9%a3%8e%e8%be%b02',
        'sid': '',
        'vtpst': '|',
        'websitepoptg_api_time': '1715218615434',
        'st_si': '46368340182479',
        'EmFundFavorVersion': '1686749115372',
        'EmFundFavorVersion2': '1686749115372',
        'st_asi': 'delete',
        'EMFUND0': 'null',
        'st_pvi': '35290886003252',
        'st_sp': '2023-12-17%2018%3A51%3A34',
        'st_inirUrl': 'https%3A%2F%2Fcn.bing.com%2F',
        'st_sn': '27',
        'st_psi': '20240509100744555-112200305283-5067673963',
    }

    # One of these User-Agent strings is picked at random for every request.
    _USER_AGENTS = (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
    )

    def get_jijin_data(self, *args, session=None):
        """
        Fetch one page of historical net-value records for one fund.

        The target may be passed either as a single ``(fundCode, page)``
        sequence (the form used with ``pool.map``) or as two positional
        arguments ``fundCode, page``.

        :param args: ``(fundCode, page)`` packed in one sequence, or the two
            values given separately
        :param session: optional object exposing a ``requests``-compatible
            ``get`` method (e.g. a ``requests.Session``); the ``requests``
            module itself is used when omitted
        :return: list of ``[date, value]`` pairs (``value`` is a float) in the
            reverse of the order in which they appear in the response; records
            whose value is empty or not numeric are skipped
        """
        target = args[0] if len(args) == 1 else args
        fund_code, page_index = target[0], target[1]

        headers = {
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'Connection': 'keep-alive',
            'Referer': 'https://fundf10.eastmoney.com/',
            'User-Agent': random.choice(self._USER_AGENTS),
        }
        params = {
            'callback': 'jQuery183019015669101010957_1715220464680',
            'fundCode': fund_code,
            'pageIndex': page_index,
            'pageSize': '20',
            'startDate': '',
            'endDate': '',
            '_': '1715220492762',
        }

        http = requests if session is None else session
        response = http.get(
            self.API_URL,
            params=params,
            cookies=self._COOKIES,
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )

        rows = []
        for date, raw_value in reversed(self._RECORD_PATTERN.findall(response.text)):
            try:
                value = float(raw_value)
            except ValueError:
                # A record without a usable net value must not abort the whole
                # page (and with it the whole thread-pool map); drop it.
                continue
            rows.append([date, value])
        return rows

    def _fetch_history(self, fund_code, pages=None):
        """
        Download several pages for one fund concurrently and combine them.

        :param fund_code: fund code to query
        :param pages: iterable of page indexes; defaults to ``PAGE_RANGE``
        :return: ``pandas.DataFrame`` with columns ``['date', '收盘']``
        """
        page_indexes = list(self.PAGE_RANGE if pages is None else pages)
        # Highest page first; together with the per-page reversal done in
        # get_jijin_data the combined rows end up in the reverse of the API's
        # listing order.
        tasks = [(fund_code, page) for page in reversed(page_indexes)]

        page_rows = []
        if tasks:
            # Thread pool (not processes): the work is network-bound. The
            # context manager releases the workers even if a request raises.
            with ThreadPool(min(len(tasks), self.MAX_WORKERS)) as pool:
                page_rows = pool.map(self.get_jijin_data, tasks)

        rows = [row for page in page_rows for row in page]
        return pd.DataFrame(rows, columns=['date', '收盘'])

    def get_hx_data(self):
        """
        Fetch the historical net values of fund 008107
        (Huashang Medicine & Healthcare Industry Equity Fund).

        :return: ``pandas.DataFrame`` with columns ``['date', '收盘']``
        """
        return self._fetch_history('008107')

    def get_gf_data(self):
        """
        Fetch the historical net values of fund 010731
        (GF Innovative Healthcare Two-Year Holding Mixed Fund).

        :return: ``pandas.DataFrame`` with columns ``['date', '收盘']``
        """
        return self._fetch_history('010731')
def get_tiantian_jijin_data():
    """
    Fetch both tracked funds, average their net values per date and store
    the result through the ``JijinData`` model.

    For every date present in both funds' histories a ``JijinData`` row is
    created (``get_or_create`` keyed on ``date``) unless one already exists,
    and a status line is printed for each date.

    :return: None
    """
    spider = JijinSpider()
    gf_df = spider.get_gf_data()
    hx_df = spider.get_hx_data()

    # Inner join: keep only the dates that appear in both histories.
    merged = gf_df.merge(hx_df, on='date', how='inner', suffixes=('_gf', '_hx'))
    merged['stock_data'] = (merged['收盘_gf'] + merged['收盘_hx']) / 2

    for date, value in merged[['date', 'stock_data']].values.tolist():
        _, created = JijinData.objects.get_or_create(date=date, defaults={'jijin_data': value})
        if created:
            # The previous wording ("with infections") was copied from
            # unrelated code; the stored value is the averaged fund value.
            print(f"Added new record for date {date} with fund value {value}")
        else:
            print(f"Record for date {date} already exists.")
# get_tiantian_jijin_data()
# Scrape the data of the fund whose value is to be predicted
def get_fund_data_by_code(code, page_count=16):
    """
    Fetch the historical net values of an arbitrary fund.

    :param code: fund code passed to ``JijinSpider.get_jijin_data``
    :param page_count: number of result pages to request, starting at page 1
        (defaults to 16, the previously hard-coded amount)
    :return: ``pandas.DataFrame`` with columns ``['date', 'fund_data']``;
        empty when ``page_count`` is less than 1
    """
    spider = JijinSpider()
    # Highest page first; get_jijin_data also reverses the rows inside each
    # page, so the combined rows come out in the reverse of the API's listing
    # order (presumably oldest first -- confirm).
    tasks = [(code, page) for page in range(page_count, 0, -1)]

    page_rows = []
    if tasks:
        # Thread pool (not processes): the work is network-bound. Size it to
        # the workload and let the context manager release the workers even
        # if a request raises.
        with ThreadPool(min(len(tasks), 16)) as pool:
            page_rows = pool.map(spider.get_jijin_data, tasks)

    rows = [row for page in page_rows for row in page]
    return pd.DataFrame(rows, columns=['date', 'fund_data'])