添加了流感周报爬取脚本和用户代理池,用于从指定网站爬取流感周报数据。流感周报爬取2.py负责爬取数据并保存到CSV文件中,user_agents_pool.py提供了可供使用的用户代理池。dev_test
parent
941c9fe7e0
commit
442b502724
@ -0,0 +1,41 @@
|
|||||||
|
# Pool of User-Agent header values available for use by the scrapers.
#
# Each entry is built with implicit string concatenation inside parentheses
# rather than a backslash continuation inside the literal: with a backslash,
# any indentation on the continuation line silently becomes part of the
# User-Agent value.
agent_list = [
    (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:87.0) "
        "Gecko/20100101 Firefox/87.0"
    ),
    (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0"
    ),
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
    ),
    (
        "Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/62.0.3202.84 Mobile Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/63.0.3239.111 Mobile Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/70.0.3538.80 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) "
        "AppleWebKit/535.19 (KHTML, like Gecko) "
        "Silk/3.13 Safari/535.19 Silk-Accelerated=true"
    ),
    (
        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; "
        "Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/14.14263"
    ),
    (
        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; "
        "Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/14.14263"
    ),
    (
        "Mozilla/5.0 (Linux; Android 11; moto g power (2022)) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Mobile Safari/537.36"
    ),
    (
        # The space before "(KHTML" was missing in the original entry.
        "Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Mobile Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Mobile Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Mobile Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Mobile Safari/537.36"
    ),
]
|
@ -0,0 +1,131 @@
|
|||||||
|
import requests
|
||||||
|
import re
|
||||||
|
from multiprocessing.pool import ThreadPool
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
# Seconds to wait for the fund API before abandoning a request; without a
# timeout a stalled connection would block its worker thread forever.
_REQUEST_TIMEOUT = 10


def get_jijin_data(*args):
    """Fetch one page of historical net-value records for a fund.

    The signature is shaped for ``ThreadPool.map``: the function receives a
    single positional argument, a ``(fund_code, page_index)`` pair.

    :param args: ``args[0]`` is an indexable pair; item 0 is the fund code
        and item 1 is the page number to request.
    :return: list of ``[FSRQ, DWJZ]`` string pairs in response order; an
        empty list when the response contains no matching records.
        (The caller uses FSRQ as the date column; DWJZ is presumably the
        unit net value -- confirm against the API.)
    :raises requests.RequestException: on connection failure, timeout, or a
        non-success HTTP status.
    """
    fund_code, page_index = args[0][0], args[0][1]

    # NOTE(review): these look like cookies captured from a logged-in browser
    # session (ct / ut / pi) and are hard-coded in source. Consider loading
    # them from configuration, and confirm whether the endpoint needs them.
    cookies = {
        'qgqp_b_id': '5c08ebc12f489b4f5ba9e76c2539ce0b',
        'emshistory':
        '%5B%2200005%22%2C%2200002%22%2C%2200002%E3%80%81%22%2C%2200001%22%5D',
        'HAList':
        'ty-0-300411-%u91D1%u76FE%u80A1%u4EFD%2Cty-0-399366-%u80FD%u6E90%u91D1%u5C5E%2Cty-116-00002-%u4E2D%u7535%u63A7%u80A1%2Cty-116-03119-GX%u4E9A%u6D32%u534A%u5BFC%u4F53%2Cty-116-00007-%u667A%u5BCC%u8D44%u6E90%u6295%u8D44%2Cty-116-00001-%u957F%u548C%2Cty-116-00016-%u65B0%u9E3F%u57FA%u5730%u4EA7%2Cty-0-301075-%u591A%u745E%u533B%u836F%2Cty-90-BK1042-%u533B%u836F%u5546%u4E1A%2Cty-1-601607-%u4E0A%u6D77%u533B%u836F',
        'mtp': '1',
        'ct':
        'Rc8QhLQwVpXSsLuf4UOMLbPMtE9gFAEkMTisAatrxh1rv-WFWG9EC-2zw_WFCJnVfsaViwejVO4ziLTZig1GUptw6NORwx36yfzDu9g9zstYkLdwIWvQ-9QqGL-F5C1GCS7xhUtoBrFAibnr_-HA078LL8tr7yWiGM9V3ZmooC8',
        'ut':
        'FobyicMgeV54OLFNgnrRk4fT26HSX01NG2N55VZbVzZlqOMDJ-67DsHyCMk6G-yTMaqRhIAFuiYbVkK6Y-sYY8ghkJ3v9gyvUZyHWYpJnreP78yw4o-H8FNcTvUXmOj4KLsGaYuV1TAHltcdN0WDTy-YCOJ8OlzrX-MQbQc_CBvXfUYn10iBhXwvJY94XBkg4eOCJpu6Dok3ot9Xsr8flPIDz6f3KxJcIgnXZ7QpZKDMIvavpSunuMiR8Q5ezUD2y-JiBEgNkeoH_36wg0elojOfd5k61gTK',
        'pi':
        '6293426663250936%3Bm6293426663250936%3B%E4%BA%89%E5%88%86%E5%A4%BA%E7%A7%92%E7%9A%84%E9%A3%8E%E8%BE%B02%3B4qqIkcy3NvmegD2EnE%2BsOg2O1jjgPTjDxX3du3GmlWaCk8fr0sJ%2FmubqRXtUqqRoZWsMMmMvcfSg1wNNX8p93XE3fanPRZvbcs7bYEjCeUqg5RMcJtmbM9jEifMzwRAAmCipwh9KbqrYLdkLenTwJYqOaG9qmaZ2qDmn2Pa66eitUxhH2q0aU0kerTnJCi2qJnM8Y0Oc%3Bz%2Bzk7gxq8gdHwxSGucOoQSvBZ44Uaf7Um0f7bFnTUgwLnxWm2OMnlrG9SZX6ezbrsEoqVVrOk%2FVRGekqxUH%2BufKtmb89UVNnA0x62lxu6z84Y8dT0sXAWUELHmWZf8cnumRIL8kPvuAcHSXq5P6pTC3OaxbBeQ%3D%3D',
        'uidal':
        '6293426663250936%e4%ba%89%e5%88%86%e5%a4%ba%e7%a7%92%e7%9a%84%e9%a3%8e%e8%be%b02',
        'sid': '',
        'vtpst': '|',
        'websitepoptg_api_time': '1715218615434',
        'st_si': '46368340182479',
        'EmFundFavorVersion': '1686749115372',
        'EmFundFavorVersion2': '1686749115372',
        'st_asi': 'delete',
        'EMFUND0': 'null',
        'st_pvi': '35290886003252',
        'st_sp': '2023-12-17%2018%3A51%3A34',
        'st_inirUrl': 'https%3A%2F%2Fcn.bing.com%2F',
        'st_sn': '27',
        'st_psi': '20240509100744555-112200305283-5067673963',
    }

    headers = {
        'Accept':
        '*/*',
        'Accept-Language':
        'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Connection':
        'keep-alive',
        'Referer':
        'https://fundf10.eastmoney.com/',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
    }

    params = {
        # A callback name is sent, so the body is not parsed as plain JSON;
        # the records are pulled out with a regex below instead.
        'callback': 'jQuery183019015669101010957_1715220464680',
        'fundCode': fund_code,
        'pageIndex': page_index,
        'pageSize': '20',
        'startDate': '',
        'endDate': '',
        '_': '1715220492762',
    }

    response = requests.get('https://api.fund.eastmoney.com/f10/lsjz',
                            params=params,
                            cookies=cookies,
                            headers=headers,
                            timeout=_REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of returning an empty page that
    # would silently leave a gap in the saved history.
    response.raise_for_status()

    pattern = r'"FSRQ":"(.*?)","DWJZ":"(.*?)"'
    return [list(record) for record in re.findall(pattern, response.text)]
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_fund_history(fund_code, pages=range(1, 29)):
    """Download the given result pages for one fund and flatten them.

    Pages are fetched concurrently with a thread pool (threads, not
    processes), one ``get_jijin_data`` call per page.

    :param fund_code: fund code passed through to ``get_jijin_data``.
    :param pages: iterable of page numbers to request; defaults to 1..28.
        NOTE(review): the page count is fixed, so history beyond page 28
        would not be fetched -- confirm this is intended.
    :return: one flat list of the rows from every page, in page order.
    """
    task_args = [(fund_code, page) for page in pages]
    if not task_args:
        return []
    # Never start more threads than there are pages; the context manager
    # tears the pool down even when a worker raises.
    with ThreadPool(min(100, len(task_args))) as pool:
        rows_per_page = pool.map(get_jijin_data, task_args)
    return [row for page_rows in rows_per_page for row in page_rows]


def get_hx_data():
    """Fetch historical net-value data for fund 008107.

    The original docstring names it the Huashang pharmaceutical and
    healthcare industry stock fund.

    :return: list of rows as produced by ``get_jijin_data``.
    """
    hx_data = _fetch_fund_history('008107')
    print(hx_data)
    return hx_data


def get_gf_data():
    """Fetch historical net-value data for fund 010731.

    The original docstring names it the GF innovative healthcare two-year
    holding hybrid fund.

    :return: list of rows as produced by ``get_jijin_data``.
    """
    gf_data = _fetch_fund_history('010731')
    print(gf_data)
    return gf_data
|
||||||
|
|
||||||
|
|
||||||
|
def save_data_to_csv(data, filename):
    """Write fund rows to ``<filename>.csv`` sorted by date.

    :param data: sequence of two-item rows; the first item is parsed as a
        date, the second is stored under a column named after ``filename``.
    :param filename: output path without the ``.csv`` suffix; also used as
        the name of the value column.
    :return: None. The file is written and a confirmation line is printed.
    """
    frame = pd.DataFrame(data, columns=['date', filename])
    # Convert to real datetimes so rows sort chronologically rather than
    # as text.
    frame['date'] = pd.to_datetime(frame['date'])
    ordered = frame.sort_values(by='date')
    csv_path = f'{filename}.csv'
    ordered.to_csv(csv_path, encoding="utf_8")
    print(f'成功爬取流感基金数据并保存在{filename}.csv中')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Scrape and save only when run as a script, so that importing this
    # module does not trigger network requests or file writes.
    save_data_to_csv(get_hx_data(), 'hx_jijin_data')
    save_data_to_csv(get_gf_data(), 'gf_jijin_data')
|
Loading…
Reference in new issue