You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1236 lines
44 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_866/224548020.py:1: DtypeWarning: Columns (7) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" user=pd.read_csv('../../data/sample/users.csv',sep='\\t',encoding='utf-8')\n"
]
}
],
"source": [
"user=pd.read_csv('../../data/sample/users.csv',sep='\\t',encoding='utf-8')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>user_name</th>\n",
" <th>gender</th>\n",
" <th>school_id</th>\n",
" <th>school_name</th>\n",
" <th>location</th>\n",
" <th>location_city</th>\n",
" <th>occupation</th>\n",
" <th>identity</th>\n",
" <th>technical_title</th>\n",
" <th>edu_background</th>\n",
" <th>edu_entry_year</th>\n",
" <th>gid</th>\n",
" <th>logins</th>\n",
" <th>grade</th>\n",
" <th>experience</th>\n",
" <th>last_login_on</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>实践教学</td>\n",
" <td>0.0</td>\n",
" <td>38304.0</td>\n",
" <td>头歌教研中心</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>国防科学技术大学</td>\n",
" <td>0.0</td>\n",
" <td>教授</td>\n",
" <td>8.0</td>\n",
" <td>2021.0</td>\n",
" <td>0.0</td>\n",
" <td>63616</td>\n",
" <td>442508</td>\n",
" <td>414368</td>\n",
" <td>2021-11-08 11:20:37</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5</td>\n",
" <td>尹刚</td>\n",
" <td>0.0</td>\n",
" <td>117.0</td>\n",
" <td>国防科技大学</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>国防科学技术大学</td>\n",
" <td>2.0</td>\n",
" <td>部门管理者</td>\n",
" <td>8.0</td>\n",
" <td>2001.0</td>\n",
" <td>0.0</td>\n",
" <td>56171</td>\n",
" <td>11904</td>\n",
" <td>11120</td>\n",
" <td>2022-04-15 09:10:10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6</td>\n",
" <td>王林春</td>\n",
" <td>0.0</td>\n",
" <td>1618.0</td>\n",
" <td>湖南工业职业技术学院</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>讲师</td>\n",
" <td>1.0</td>\n",
" <td>2021.0</td>\n",
" <td>0.0</td>\n",
" <td>3428</td>\n",
" <td>71915</td>\n",
" <td>74300</td>\n",
" <td>2022-04-09 10:17:54</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7</td>\n",
" <td>王老师</td>\n",
" <td>0.0</td>\n",
" <td>117.0</td>\n",
" <td>国防科技大学</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>国防科学技术大学</td>\n",
" <td>0.0</td>\n",
" <td>教授</td>\n",
" <td>1.0</td>\n",
" <td>2022.0</td>\n",
" <td>0.0</td>\n",
" <td>11882</td>\n",
" <td>2100</td>\n",
" <td>0</td>\n",
" <td>2022-04-15 09:26:34</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>10</td>\n",
" <td>余跃</td>\n",
" <td>0.0</td>\n",
" <td>117.0</td>\n",
" <td>国防科技大学</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>国防科学技术大学</td>\n",
" <td>0.0</td>\n",
" <td>讲师</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>2301</td>\n",
" <td>200</td>\n",
" <td>150</td>\n",
" <td>2022-04-09 16:05:54</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user_id user_name gender school_id school_name location location_city \\\n",
"0 1 实践教学 0.0 38304.0 头歌教研中心 湖南 长沙 \n",
"1 5 尹刚 0.0 117.0 国防科技大学 湖南 长沙 \n",
"2 6 王林春 0.0 1618.0 湖南工业职业技术学院 湖南 长沙 \n",
"3 7 王老师 0.0 117.0 国防科技大学 湖南 长沙 \n",
"4 10 余跃 0.0 117.0 国防科技大学 湖南 长沙 \n",
"\n",
" occupation identity technical_title edu_background edu_entry_year gid \\\n",
"0 国防科学技术大学 0.0 教授 8.0 2021.0 0.0 \n",
"1 国防科学技术大学 2.0 部门管理者 8.0 2001.0 0.0 \n",
"2 NaN 0.0 讲师 1.0 2021.0 0.0 \n",
"3 国防科学技术大学 0.0 教授 1.0 2022.0 0.0 \n",
"4 国防科学技术大学 0.0 讲师 0.0 0.0 0.0 \n",
"\n",
" logins grade experience last_login_on \n",
"0 63616 442508 414368 2021-11-08 11:20:37 \n",
"1 56171 11904 11120 2022-04-15 09:10:10 \n",
"2 3428 71915 74300 2022-04-09 10:17:54 \n",
"3 11882 2100 0 2022-04-15 09:26:34 \n",
"4 2301 200 150 2022-04-09 16:05:54 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"user.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"users_l=user.rename(columns={'visits': 'logins'}, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>user_name</th>\n",
" <th>gender</th>\n",
" <th>school_id</th>\n",
" <th>school_name</th>\n",
" <th>location</th>\n",
" <th>location_city</th>\n",
" <th>occupation</th>\n",
" <th>identity</th>\n",
" <th>technical_title</th>\n",
" <th>edu_background</th>\n",
" <th>edu_entry_year</th>\n",
" <th>gid</th>\n",
" <th>logins</th>\n",
" <th>grade</th>\n",
" <th>experience</th>\n",
" <th>last_login_on</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>实践教学</td>\n",
" <td>0.0</td>\n",
" <td>38304.0</td>\n",
" <td>头歌教研中心</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>国防科学技术大学</td>\n",
" <td>0.0</td>\n",
" <td>教授</td>\n",
" <td>8.0</td>\n",
" <td>2021.0</td>\n",
" <td>0.0</td>\n",
" <td>63616</td>\n",
" <td>442508</td>\n",
" <td>414368</td>\n",
" <td>2021-11-08 11:20:37</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5</td>\n",
" <td>尹刚</td>\n",
" <td>0.0</td>\n",
" <td>117.0</td>\n",
" <td>国防科技大学</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>国防科学技术大学</td>\n",
" <td>2.0</td>\n",
" <td>部门管理者</td>\n",
" <td>8.0</td>\n",
" <td>2001.0</td>\n",
" <td>0.0</td>\n",
" <td>56171</td>\n",
" <td>11904</td>\n",
" <td>11120</td>\n",
" <td>2022-04-15 09:10:10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6</td>\n",
" <td>王林春</td>\n",
" <td>0.0</td>\n",
" <td>1618.0</td>\n",
" <td>湖南工业职业技术学院</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>讲师</td>\n",
" <td>1.0</td>\n",
" <td>2021.0</td>\n",
" <td>0.0</td>\n",
" <td>3428</td>\n",
" <td>71915</td>\n",
" <td>74300</td>\n",
" <td>2022-04-09 10:17:54</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7</td>\n",
" <td>王老师</td>\n",
" <td>0.0</td>\n",
" <td>117.0</td>\n",
" <td>国防科技大学</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>国防科学技术大学</td>\n",
" <td>0.0</td>\n",
" <td>教授</td>\n",
" <td>1.0</td>\n",
" <td>2022.0</td>\n",
" <td>0.0</td>\n",
" <td>11882</td>\n",
" <td>2100</td>\n",
" <td>0</td>\n",
" <td>2022-04-15 09:26:34</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>10</td>\n",
" <td>余跃</td>\n",
" <td>0.0</td>\n",
" <td>117.0</td>\n",
" <td>国防科技大学</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>国防科学技术大学</td>\n",
" <td>0.0</td>\n",
" <td>讲师</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>2301</td>\n",
" <td>200</td>\n",
" <td>150</td>\n",
" <td>2022-04-09 16:05:54</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>474964</th>\n",
" <td>826943</td>\n",
" <td>李波老师222</td>\n",
" <td>0.0</td>\n",
" <td>1581.0</td>\n",
" <td>湖南科技大学</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>讲师</td>\n",
" <td>1.0</td>\n",
" <td>2023.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>500</td>\n",
" <td>0</td>\n",
" <td>2023-04-24 10:28:08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>474965</th>\n",
" <td>826944</td>\n",
" <td>DWT</td>\n",
" <td>1.0</td>\n",
" <td>573.0</td>\n",
" <td>吉林农业大学</td>\n",
" <td>吉林</td>\n",
" <td>长春</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>6.0</td>\n",
" <td>2020.0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>50</td>\n",
" <td>0</td>\n",
" <td>2023-04-19 15:04:24</td>\n",
" </tr>\n",
" <tr>\n",
" <th>474966</th>\n",
" <td>826945</td>\n",
" <td>沈晓绿</td>\n",
" <td>-1.0</td>\n",
" <td>117.0</td>\n",
" <td>国防科技大学</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>-1.0</td>\n",
" <td>0</td>\n",
" <td>500</td>\n",
" <td>0</td>\n",
" <td>2023-04-19 16:54:15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>474967</th>\n",
" <td>826946</td>\n",
" <td>王子豪</td>\n",
" <td>-1.0</td>\n",
" <td>3364.0</td>\n",
" <td>湖南智擎科技有限公司</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" <td>工程师</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>-1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2023-04-23 10:19:01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>474968</th>\n",
" <td>826947</td>\n",
" <td>周平</td>\n",
" <td>0.0</td>\n",
" <td>117.0</td>\n",
" <td>国防科技大学</td>\n",
" <td>北京</td>\n",
" <td>朝阳</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>6.0</td>\n",
" <td>2019.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>50</td>\n",
" <td>0</td>\n",
" <td>2023-04-24 14:14:28</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>474969 rows × 17 columns</p>\n",
"</div>"
],
"text/plain": [
" user_id user_name gender school_id school_name location \\\n",
"0 1 实践教学 0.0 38304.0 头歌教研中心 湖南 \n",
"1 5 尹刚 0.0 117.0 国防科技大学 湖南 \n",
"2 6 王林春 0.0 1618.0 湖南工业职业技术学院 湖南 \n",
"3 7 王老师 0.0 117.0 国防科技大学 湖南 \n",
"4 10 余跃 0.0 117.0 国防科技大学 湖南 \n",
"... ... ... ... ... ... ... \n",
"474964 826943 李波老师222 0.0 1581.0 湖南科技大学 湖南 \n",
"474965 826944 DWT 1.0 573.0 吉林农业大学 吉林 \n",
"474966 826945 沈晓绿 -1.0 117.0 国防科技大学 NaN \n",
"474967 826946 王子豪 -1.0 3364.0 湖南智擎科技有限公司 NaN \n",
"474968 826947 周平 0.0 117.0 国防科技大学 北京 \n",
"\n",
" location_city occupation identity technical_title edu_background \\\n",
"0 长沙 国防科学技术大学 0.0 教授 8.0 \n",
"1 长沙 国防科学技术大学 2.0 部门管理者 8.0 \n",
"2 长沙 NaN 0.0 讲师 1.0 \n",
"3 长沙 国防科学技术大学 0.0 教授 1.0 \n",
"4 长沙 国防科学技术大学 0.0 讲师 0.0 \n",
"... ... ... ... ... ... \n",
"474964 长沙 NaN 0.0 讲师 1.0 \n",
"474965 长春 NaN 1.0 NaN 6.0 \n",
"474966 NaN NaN 1.0 NaN 0.0 \n",
"474967 NaN NaN 2.0 工程师 0.0 \n",
"474968 朝阳 NaN 1.0 NaN 6.0 \n",
"\n",
" edu_entry_year gid logins grade experience last_login_on \n",
"0 2021.0 0.0 63616 442508 414368 2021-11-08 11:20:37 \n",
"1 2001.0 0.0 56171 11904 11120 2022-04-15 09:10:10 \n",
"2 2021.0 0.0 3428 71915 74300 2022-04-09 10:17:54 \n",
"3 2022.0 0.0 11882 2100 0 2022-04-15 09:26:34 \n",
"4 0.0 0.0 2301 200 150 2022-04-09 16:05:54 \n",
"... ... ... ... ... ... ... \n",
"474964 2023.0 0.0 0 500 0 2023-04-24 10:28:08 \n",
"474965 2020.0 1.0 0 50 0 2023-04-19 15:04:24 \n",
"474966 0.0 -1.0 0 500 0 2023-04-19 16:54:15 \n",
"474967 0.0 -1.0 0 0 0 2023-04-23 10:19:01 \n",
"474968 2019.0 0.0 0 50 0 2023-04-24 14:14:28 \n",
"\n",
"[474969 rows x 17 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"user"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"user.to_csv('../../data/sample/users.csv',sep='\\t', index=False, header=True)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n"
]
}
],
"source": [
"print(pd.isnull(user['experience'].values).any())"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"shixun=pd.read_csv('../../data/sample/shixun_merage_emb.csv',sep='\\t',encoding='utf-8')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>shixun_id</th>\n",
" <th>visits</th>\n",
" <th>challenges_count</th>\n",
" <th>averge_star</th>\n",
" <th>task_pass</th>\n",
" <th>emb_0</th>\n",
" <th>emb_1</th>\n",
" <th>emb_2</th>\n",
" <th>emb_3</th>\n",
" <th>emb_4</th>\n",
" <th>...</th>\n",
" <th>emb_90</th>\n",
" <th>emb_91</th>\n",
" <th>emb_92</th>\n",
" <th>emb_93</th>\n",
" <th>emb_94</th>\n",
" <th>emb_95</th>\n",
" <th>emb_96</th>\n",
" <th>emb_97</th>\n",
" <th>emb_98</th>\n",
" <th>emb_99</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>43</td>\n",
" <td>1255</td>\n",
" <td>4</td>\n",
" <td>4.8</td>\n",
" <td>1</td>\n",
" <td>0.178059</td>\n",
" <td>-0.223661</td>\n",
" <td>0.431724</td>\n",
" <td>0.222664</td>\n",
" <td>-0.051939</td>\n",
" <td>...</td>\n",
" <td>1.277493</td>\n",
" <td>0.033392</td>\n",
" <td>0.085430</td>\n",
" <td>0.334810</td>\n",
" <td>0.194489</td>\n",
" <td>0.405730</td>\n",
" <td>0.223120</td>\n",
" <td>-0.162922</td>\n",
" <td>0.020598</td>\n",
" <td>0.260939</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>49</td>\n",
" <td>6208</td>\n",
" <td>3</td>\n",
" <td>4.8</td>\n",
" <td>0</td>\n",
" <td>0.029035</td>\n",
" <td>-0.233039</td>\n",
" <td>0.211546</td>\n",
" <td>0.176656</td>\n",
" <td>-0.227015</td>\n",
" <td>...</td>\n",
" <td>0.688604</td>\n",
" <td>0.257212</td>\n",
" <td>0.023406</td>\n",
" <td>0.462871</td>\n",
" <td>0.364423</td>\n",
" <td>0.555712</td>\n",
" <td>0.018691</td>\n",
" <td>0.159427</td>\n",
" <td>-0.310509</td>\n",
" <td>-0.173528</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>50</td>\n",
" <td>2527</td>\n",
" <td>6</td>\n",
" <td>4.8</td>\n",
" <td>0</td>\n",
" <td>-0.098617</td>\n",
" <td>0.150550</td>\n",
" <td>-0.053376</td>\n",
" <td>0.137188</td>\n",
" <td>0.076380</td>\n",
" <td>...</td>\n",
" <td>0.226777</td>\n",
" <td>0.334204</td>\n",
" <td>0.347067</td>\n",
" <td>0.351606</td>\n",
" <td>0.160054</td>\n",
" <td>0.212226</td>\n",
" <td>-0.042232</td>\n",
" <td>0.027391</td>\n",
" <td>-0.163952</td>\n",
" <td>0.083248</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>51</td>\n",
" <td>12651</td>\n",
" <td>3</td>\n",
" <td>4.9</td>\n",
" <td>0</td>\n",
" <td>0.107260</td>\n",
" <td>0.024969</td>\n",
" <td>0.001965</td>\n",
" <td>0.326085</td>\n",
" <td>-0.264945</td>\n",
" <td>...</td>\n",
" <td>0.883407</td>\n",
" <td>0.031283</td>\n",
" <td>-0.068510</td>\n",
" <td>0.483277</td>\n",
" <td>0.856700</td>\n",
" <td>0.685457</td>\n",
" <td>0.132283</td>\n",
" <td>0.315131</td>\n",
" <td>-0.675356</td>\n",
" <td>-0.090122</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>53</td>\n",
" <td>4043</td>\n",
" <td>4</td>\n",
" <td>4.9</td>\n",
" <td>0</td>\n",
" <td>0.143967</td>\n",
" <td>-0.325837</td>\n",
" <td>0.068179</td>\n",
" <td>0.015797</td>\n",
" <td>0.146450</td>\n",
" <td>...</td>\n",
" <td>0.674001</td>\n",
" <td>0.206088</td>\n",
" <td>0.140978</td>\n",
" <td>0.533451</td>\n",
" <td>0.263060</td>\n",
" <td>0.296306</td>\n",
" <td>-0.113665</td>\n",
" <td>0.418402</td>\n",
" <td>-0.367488</td>\n",
" <td>0.065241</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 105 columns</p>\n",
"</div>"
],
"text/plain": [
" shixun_id visits challenges_count averge_star task_pass emb_0 \\\n",
"0 43 1255 4 4.8 1 0.178059 \n",
"1 49 6208 3 4.8 0 0.029035 \n",
"2 50 2527 6 4.8 0 -0.098617 \n",
"3 51 12651 3 4.9 0 0.107260 \n",
"4 53 4043 4 4.9 0 0.143967 \n",
"\n",
" emb_1 emb_2 emb_3 emb_4 ... emb_90 emb_91 emb_92 \\\n",
"0 -0.223661 0.431724 0.222664 -0.051939 ... 1.277493 0.033392 0.085430 \n",
"1 -0.233039 0.211546 0.176656 -0.227015 ... 0.688604 0.257212 0.023406 \n",
"2 0.150550 -0.053376 0.137188 0.076380 ... 0.226777 0.334204 0.347067 \n",
"3 0.024969 0.001965 0.326085 -0.264945 ... 0.883407 0.031283 -0.068510 \n",
"4 -0.325837 0.068179 0.015797 0.146450 ... 0.674001 0.206088 0.140978 \n",
"\n",
" emb_93 emb_94 emb_95 emb_96 emb_97 emb_98 emb_99 \n",
"0 0.334810 0.194489 0.405730 0.223120 -0.162922 0.020598 0.260939 \n",
"1 0.462871 0.364423 0.555712 0.018691 0.159427 -0.310509 -0.173528 \n",
"2 0.351606 0.160054 0.212226 -0.042232 0.027391 -0.163952 0.083248 \n",
"3 0.483277 0.856700 0.685457 0.132283 0.315131 -0.675356 -0.090122 \n",
"4 0.533451 0.263060 0.296306 -0.113665 0.418402 -0.367488 0.065241 \n",
"\n",
"[5 rows x 105 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"shixun.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n"
]
}
],
"source": [
"print(pd.isnull(shixun['task_pass'].values).any())"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"list1 = [[5,2,0,4],[2,3,4,5],[3,6,1,9],[4,1,0,8]]\n",
"name=['a','b','c','d']\n",
"df1=pd.DataFrame(list1,columns=name)\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"list1 = [[5,2,0,4],[1,1,1,5],[1,1,1,9],[4,1,0,8]]\n",
"name=['a','e','f','d']\n",
"df2=pd.DataFrame(list1,columns=name)\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" <th>d_x</th>\n",
" <th>e</th>\n",
" <th>f</th>\n",
" <th>d_y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a b c d_x e f d_y\n",
"0 5 2 0 4 2 0 4\n",
"1 4 1 0 8 1 0 8"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df=pd.merge(df1,df2,on=['a'])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"subject=pd.read_csv('../../data/sample/subjects.csv',sep='\\t',encoding='utf-8')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>disciplines_id</th>\n",
" <th>disciplines_name</th>\n",
" <th>sub_discipline_id</th>\n",
" <th>sub_discipline_name</th>\n",
" <th>subject_id</th>\n",
" <th>subject_name</th>\n",
" <th>visits</th>\n",
" <th>status</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>...</th>\n",
" <th>initiative_passed_count</th>\n",
" <th>initiative_challenge_count</th>\n",
" <th>initiative_evaluate_count</th>\n",
" <th>video_study_time</th>\n",
" <th>initiative_video_study_time</th>\n",
" <th>study_pdf_attachment_count</th>\n",
" <th>initiative_study_pdf_attachment_count</th>\n",
" <th>tag_names</th>\n",
" <th>averge_star</th>\n",
" <th>created_at_ts</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>程序设计语言</td>\n",
" <td>2.0</td>\n",
" <td>Java程序设计</td>\n",
" <td>1</td>\n",
" <td>C++程序设计</td>\n",
" <td>2064</td>\n",
" <td>2</td>\n",
" <td>2017-07-17 11:08:11</td>\n",
" <td>2021-12-28 16:56:15</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>运算符与表达式 循环 标识符、关键字 数据类型 数组 集合类 异常 多态 语法 时间 网络 ...</td>\n",
" <td>0.0</td>\n",
" <td>1.500290e+09</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1 rows × 36 columns</p>\n",
"</div>"
],
"text/plain": [
" disciplines_id disciplines_name sub_discipline_id sub_discipline_name \\\n",
"0 1 程序设计语言 2.0 Java程序设计 \n",
"\n",
" subject_id subject_name visits status created_at \\\n",
"0 1 C++程序设计 2064 2 2017-07-17 11:08:11 \n",
"\n",
" updated_at ... initiative_passed_count \\\n",
"0 2021-12-28 16:56:15 ... NaN \n",
"\n",
" initiative_challenge_count initiative_evaluate_count video_study_time \\\n",
"0 NaN NaN NaN \n",
"\n",
" initiative_video_study_time study_pdf_attachment_count \\\n",
"0 NaN 0.0 \n",
"\n",
" initiative_study_pdf_attachment_count \\\n",
"0 NaN \n",
"\n",
" tag_names averge_star \\\n",
"0 运算符与表达式 循环 标识符、关键字 数据类型 数组 集合类 异常 多态 语法 时间 网络 ... 0.0 \n",
"\n",
" created_at_ts \n",
"0 1.500290e+09 \n",
"\n",
"[1 rows x 36 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subject.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1605 entries, 0 to 1604\n",
"Data columns (total 36 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 disciplines_id 1605 non-null int64 \n",
" 1 disciplines_name 431 non-null object \n",
" 2 sub_discipline_id 431 non-null float64\n",
" 3 sub_discipline_name 431 non-null object \n",
" 4 subject_id 1605 non-null int64 \n",
" 5 subject_name 1605 non-null object \n",
" 6 visits 1605 non-null int64 \n",
" 7 status 1605 non-null int64 \n",
" 8 created_at 1605 non-null object \n",
" 9 updated_at 1605 non-null object \n",
" 10 stages_count 1605 non-null int64 \n",
" 11 stage_shixuns_count 1605 non-null int64 \n",
" 12 publish_time 1030 non-null object \n",
" 13 homepage_show 1605 non-null int64 \n",
" 14 repertoire_id 467 non-null float64\n",
" 15 score_count 93 non-null float64\n",
" 16 shixuns_count 1605 non-null int64 \n",
" 17 study_count 1605 non-null float64\n",
" 18 course_study_count 1605 non-null float64\n",
" 19 initiative_study 768 non-null float64\n",
" 20 passed_count 1605 non-null float64\n",
" 21 course_used_count 768 non-null float64\n",
" 22 school_used_count 768 non-null float64\n",
" 23 challenge_count 1605 non-null float64\n",
" 24 evaluate_count 1605 non-null float64\n",
" 25 initiative_school_used_count 768 non-null float64\n",
" 26 initiative_passed_count 768 non-null float64\n",
" 27 initiative_challenge_count 768 non-null float64\n",
" 28 initiative_evaluate_count 768 non-null float64\n",
" 29 video_study_time 768 non-null float64\n",
" 30 initiative_video_study_time 768 non-null float64\n",
" 31 study_pdf_attachment_count 1605 non-null float64\n",
" 32 initiative_study_pdf_attachment_count 768 non-null float64\n",
" 33 tag_names 431 non-null object \n",
" 34 averge_star 1605 non-null float64\n",
" 35 created_at_ts 1605 non-null float64\n",
"dtypes: float64(21), int64(8), object(7)\n",
"memory usage: 451.5+ KB\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_8261/2736341527.py:1: FutureWarning: null_counts is deprecated. Use show_counts instead\n",
" subject.info(verbose=True, null_counts=True)\n"
]
}
],
"source": [
"subject.info(verbose=True, null_counts=True)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"subject[\"study_count\"]=subject[\"study_count\"].fillna(value=0)\n",
"subject[\"course_study_count\"]=subject[\"course_study_count\"].fillna(value=0)\n",
"subject[\"passed_count\"]=subject[\"passed_count\"].fillna(value=0)\n",
"subject[\"challenge_count\"]=subject[\"challenge_count\"].fillna(value=0)\n",
"subject[\"evaluate_count\"]=subject[\"evaluate_count\"].fillna(value=0)\n",
"subject[\"study_pdf_attachment_count\"]=subject[\"study_pdf_attachment_count\"].fillna(value=0)\n",
"subject[\"averge_star\"]=subject[\"averge_star\"].fillna(value=0)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1605 entries, 0 to 1604\n",
"Data columns (total 36 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 disciplines_id 1605 non-null int64 \n",
" 1 disciplines_name 431 non-null object \n",
" 2 sub_discipline_id 431 non-null float64\n",
" 3 sub_discipline_name 431 non-null object \n",
" 4 subject_id 1605 non-null int64 \n",
" 5 subject_name 1605 non-null object \n",
" 6 visits 1605 non-null int64 \n",
" 7 status 1605 non-null int64 \n",
" 8 created_at 1605 non-null object \n",
" 9 updated_at 1605 non-null object \n",
" 10 stages_count 1605 non-null int64 \n",
" 11 stage_shixuns_count 1605 non-null int64 \n",
" 12 publish_time 1030 non-null object \n",
" 13 homepage_show 1605 non-null int64 \n",
" 14 repertoire_id 467 non-null float64\n",
" 15 score_count 93 non-null float64\n",
" 16 shixuns_count 1605 non-null int64 \n",
" 17 study_count 1605 non-null float64\n",
" 18 course_study_count 1605 non-null float64\n",
" 19 initiative_study 768 non-null float64\n",
" 20 passed_count 1605 non-null float64\n",
" 21 course_used_count 768 non-null float64\n",
" 22 school_used_count 768 non-null float64\n",
" 23 challenge_count 1605 non-null float64\n",
" 24 evaluate_count 1605 non-null float64\n",
" 25 initiative_school_used_count 768 non-null float64\n",
" 26 initiative_passed_count 768 non-null float64\n",
" 27 initiative_challenge_count 768 non-null float64\n",
" 28 initiative_evaluate_count 768 non-null float64\n",
" 29 video_study_time 768 non-null float64\n",
" 30 initiative_video_study_time 768 non-null float64\n",
" 31 study_pdf_attachment_count 1605 non-null float64\n",
" 32 initiative_study_pdf_attachment_count 768 non-null float64\n",
" 33 tag_names 431 non-null object \n",
" 34 averge_star 1605 non-null float64\n",
" 35 created_at_ts 1605 non-null float64\n",
"dtypes: float64(21), int64(8), object(7)\n",
"memory usage: 451.5+ KB\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_8261/2736341527.py:1: FutureWarning: null_counts is deprecated. Use show_counts instead\n",
" subject.info(verbose=True, null_counts=True)\n"
]
}
],
"source": [
"subject.info(verbose=True, null_counts=True)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"subject.to_csv('../../data/sample/subjects.csv',sep='\\t', index=False, header=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "mooc",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}