You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1305 lines
41 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_15100/224548020.py:1: DtypeWarning: Columns (7) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" user=pd.read_csv('../../data/sample/users.csv',sep='\\t',encoding='utf-8')\n"
]
}
],
"source": [
"# low_memory=False makes pandas infer each column's dtype from the whole file,\n",
"# which avoids the mixed-type DtypeWarning previously raised for column 7.\n",
"user = pd.read_csv('../../data/sample/users.csv', sep='\\t', encoding='utf-8', low_memory=False)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>user_name</th>\n",
" <th>gender</th>\n",
" <th>school_id</th>\n",
" <th>school_name</th>\n",
" <th>location</th>\n",
" <th>location_city</th>\n",
" <th>occupation</th>\n",
" <th>identity</th>\n",
" <th>technical_title</th>\n",
" <th>edu_background</th>\n",
" <th>edu_entry_year</th>\n",
" <th>gid</th>\n",
" <th>logins</th>\n",
" <th>grade</th>\n",
" <th>experience</th>\n",
" <th>last_login_on</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>实践教学</td>\n",
" <td>0.0</td>\n",
" <td>38304.0</td>\n",
" <td>头歌教研中心</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>国防科学技术大学</td>\n",
" <td>0.0</td>\n",
" <td>教授</td>\n",
" <td>8.0</td>\n",
" <td>2021.0</td>\n",
" <td>0.0</td>\n",
" <td>63616</td>\n",
" <td>442508</td>\n",
" <td>414368</td>\n",
" <td>2021-11-08 11:20:37</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5</td>\n",
" <td>尹刚</td>\n",
" <td>0.0</td>\n",
" <td>117.0</td>\n",
" <td>国防科技大学</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>国防科学技术大学</td>\n",
" <td>2.0</td>\n",
" <td>部门管理者</td>\n",
" <td>8.0</td>\n",
" <td>2001.0</td>\n",
" <td>0.0</td>\n",
" <td>56171</td>\n",
" <td>11904</td>\n",
" <td>11120</td>\n",
" <td>2022-04-15 09:10:10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6</td>\n",
" <td>王林春</td>\n",
" <td>0.0</td>\n",
" <td>1618.0</td>\n",
" <td>湖南工业职业技术学院</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>讲师</td>\n",
" <td>1.0</td>\n",
" <td>2021.0</td>\n",
" <td>0.0</td>\n",
" <td>3428</td>\n",
" <td>71915</td>\n",
" <td>74300</td>\n",
" <td>2022-04-09 10:17:54</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7</td>\n",
" <td>王老师</td>\n",
" <td>0.0</td>\n",
" <td>117.0</td>\n",
" <td>国防科技大学</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>国防科学技术大学</td>\n",
" <td>0.0</td>\n",
" <td>教授</td>\n",
" <td>1.0</td>\n",
" <td>2022.0</td>\n",
" <td>0.0</td>\n",
" <td>11882</td>\n",
" <td>2100</td>\n",
" <td>0</td>\n",
" <td>2022-04-15 09:26:34</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>10</td>\n",
" <td>余跃</td>\n",
" <td>0.0</td>\n",
" <td>117.0</td>\n",
" <td>国防科技大学</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>国防科学技术大学</td>\n",
" <td>0.0</td>\n",
" <td>讲师</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>2301</td>\n",
" <td>200</td>\n",
" <td>150</td>\n",
" <td>2022-04-09 16:05:54</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user_id user_name gender school_id school_name location location_city \\\n",
"0 1 实践教学 0.0 38304.0 头歌教研中心 湖南 长沙 \n",
"1 5 尹刚 0.0 117.0 国防科技大学 湖南 长沙 \n",
"2 6 王林春 0.0 1618.0 湖南工业职业技术学院 湖南 长沙 \n",
"3 7 王老师 0.0 117.0 国防科技大学 湖南 长沙 \n",
"4 10 余跃 0.0 117.0 国防科技大学 湖南 长沙 \n",
"\n",
" occupation identity technical_title edu_background edu_entry_year gid \\\n",
"0 国防科学技术大学 0.0 教授 8.0 2021.0 0.0 \n",
"1 国防科学技术大学 2.0 部门管理者 8.0 2001.0 0.0 \n",
"2 NaN 0.0 讲师 1.0 2021.0 0.0 \n",
"3 国防科学技术大学 0.0 教授 1.0 2022.0 0.0 \n",
"4 国防科学技术大学 0.0 讲师 0.0 0.0 0.0 \n",
"\n",
" logins grade experience last_login_on \n",
"0 63616 442508 414368 2021-11-08 11:20:37 \n",
"1 56171 11904 11120 2022-04-15 09:10:10 \n",
"2 3428 71915 74300 2022-04-09 10:17:54 \n",
"3 11882 2100 0 2022-04-15 09:26:34 \n",
"4 2301 200 150 2022-04-09 16:05:54 "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"user.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# rename(..., inplace=True) returns None, so the old assignment left users_l as None.\n",
"# Rebind `user` to the renamed frame (later cells keep using that name) and\n",
"# let users_l refer to the same frame.\n",
"user = user.rename(columns={'visits': 'logins'})\n",
"users_l = user"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>user_name</th>\n",
" <th>gender</th>\n",
" <th>school_id</th>\n",
" <th>school_name</th>\n",
" <th>location</th>\n",
" <th>location_city</th>\n",
" <th>occupation</th>\n",
" <th>identity</th>\n",
" <th>technical_title</th>\n",
" <th>edu_background</th>\n",
" <th>edu_entry_year</th>\n",
" <th>gid</th>\n",
" <th>logins</th>\n",
" <th>grade</th>\n",
" <th>experience</th>\n",
" <th>last_login_on</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>实践教学</td>\n",
" <td>0.0</td>\n",
" <td>38304.0</td>\n",
" <td>头歌教研中心</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>国防科学技术大学</td>\n",
" <td>0.0</td>\n",
" <td>教授</td>\n",
" <td>8.0</td>\n",
" <td>2021.0</td>\n",
" <td>0.0</td>\n",
" <td>63616</td>\n",
" <td>442508</td>\n",
" <td>414368</td>\n",
" <td>2021-11-08 11:20:37</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5</td>\n",
" <td>尹刚</td>\n",
" <td>0.0</td>\n",
" <td>117.0</td>\n",
" <td>国防科技大学</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>国防科学技术大学</td>\n",
" <td>2.0</td>\n",
" <td>部门管理者</td>\n",
" <td>8.0</td>\n",
" <td>2001.0</td>\n",
" <td>0.0</td>\n",
" <td>56171</td>\n",
" <td>11904</td>\n",
" <td>11120</td>\n",
" <td>2022-04-15 09:10:10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6</td>\n",
" <td>王林春</td>\n",
" <td>0.0</td>\n",
" <td>1618.0</td>\n",
" <td>湖南工业职业技术学院</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>讲师</td>\n",
" <td>1.0</td>\n",
" <td>2021.0</td>\n",
" <td>0.0</td>\n",
" <td>3428</td>\n",
" <td>71915</td>\n",
" <td>74300</td>\n",
" <td>2022-04-09 10:17:54</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7</td>\n",
" <td>王老师</td>\n",
" <td>0.0</td>\n",
" <td>117.0</td>\n",
" <td>国防科技大学</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>国防科学技术大学</td>\n",
" <td>0.0</td>\n",
" <td>教授</td>\n",
" <td>1.0</td>\n",
" <td>2022.0</td>\n",
" <td>0.0</td>\n",
" <td>11882</td>\n",
" <td>2100</td>\n",
" <td>0</td>\n",
" <td>2022-04-15 09:26:34</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>10</td>\n",
" <td>余跃</td>\n",
" <td>0.0</td>\n",
" <td>117.0</td>\n",
" <td>国防科技大学</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>国防科学技术大学</td>\n",
" <td>0.0</td>\n",
" <td>讲师</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>2301</td>\n",
" <td>200</td>\n",
" <td>150</td>\n",
" <td>2022-04-09 16:05:54</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>474964</th>\n",
" <td>826943</td>\n",
" <td>李波老师222</td>\n",
" <td>0.0</td>\n",
" <td>1581.0</td>\n",
" <td>湖南科技大学</td>\n",
" <td>湖南</td>\n",
" <td>长沙</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>讲师</td>\n",
" <td>1.0</td>\n",
" <td>2023.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>500</td>\n",
" <td>0</td>\n",
" <td>2023-04-24 10:28:08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>474965</th>\n",
" <td>826944</td>\n",
" <td>DWT</td>\n",
" <td>1.0</td>\n",
" <td>573.0</td>\n",
" <td>吉林农业大学</td>\n",
" <td>吉林</td>\n",
" <td>长春</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>6.0</td>\n",
" <td>2020.0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>50</td>\n",
" <td>0</td>\n",
" <td>2023-04-19 15:04:24</td>\n",
" </tr>\n",
" <tr>\n",
" <th>474966</th>\n",
" <td>826945</td>\n",
" <td>沈晓绿</td>\n",
" <td>-1.0</td>\n",
" <td>117.0</td>\n",
" <td>国防科技大学</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>-1.0</td>\n",
" <td>0</td>\n",
" <td>500</td>\n",
" <td>0</td>\n",
" <td>2023-04-19 16:54:15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>474967</th>\n",
" <td>826946</td>\n",
" <td>王子豪</td>\n",
" <td>-1.0</td>\n",
" <td>3364.0</td>\n",
" <td>湖南智擎科技有限公司</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" <td>工程师</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>-1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2023-04-23 10:19:01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>474968</th>\n",
" <td>826947</td>\n",
" <td>周平</td>\n",
" <td>0.0</td>\n",
" <td>117.0</td>\n",
" <td>国防科技大学</td>\n",
" <td>北京</td>\n",
" <td>朝阳</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>6.0</td>\n",
" <td>2019.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>50</td>\n",
" <td>0</td>\n",
" <td>2023-04-24 14:14:28</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>474969 rows × 17 columns</p>\n",
"</div>"
],
"text/plain": [
" user_id user_name gender school_id school_name location \\\n",
"0 1 实践教学 0.0 38304.0 头歌教研中心 湖南 \n",
"1 5 尹刚 0.0 117.0 国防科技大学 湖南 \n",
"2 6 王林春 0.0 1618.0 湖南工业职业技术学院 湖南 \n",
"3 7 王老师 0.0 117.0 国防科技大学 湖南 \n",
"4 10 余跃 0.0 117.0 国防科技大学 湖南 \n",
"... ... ... ... ... ... ... \n",
"474964 826943 李波老师222 0.0 1581.0 湖南科技大学 湖南 \n",
"474965 826944 DWT 1.0 573.0 吉林农业大学 吉林 \n",
"474966 826945 沈晓绿 -1.0 117.0 国防科技大学 NaN \n",
"474967 826946 王子豪 -1.0 3364.0 湖南智擎科技有限公司 NaN \n",
"474968 826947 周平 0.0 117.0 国防科技大学 北京 \n",
"\n",
" location_city occupation identity technical_title edu_background \\\n",
"0 长沙 国防科学技术大学 0.0 教授 8.0 \n",
"1 长沙 国防科学技术大学 2.0 部门管理者 8.0 \n",
"2 长沙 NaN 0.0 讲师 1.0 \n",
"3 长沙 国防科学技术大学 0.0 教授 1.0 \n",
"4 长沙 国防科学技术大学 0.0 讲师 0.0 \n",
"... ... ... ... ... ... \n",
"474964 长沙 NaN 0.0 讲师 1.0 \n",
"474965 长春 NaN 1.0 NaN 6.0 \n",
"474966 NaN NaN 1.0 NaN 0.0 \n",
"474967 NaN NaN 2.0 工程师 0.0 \n",
"474968 朝阳 NaN 1.0 NaN 6.0 \n",
"\n",
" edu_entry_year gid logins grade experience last_login_on \n",
"0 2021.0 0.0 63616 442508 414368 2021-11-08 11:20:37 \n",
"1 2001.0 0.0 56171 11904 11120 2022-04-15 09:10:10 \n",
"2 2021.0 0.0 3428 71915 74300 2022-04-09 10:17:54 \n",
"3 2022.0 0.0 11882 2100 0 2022-04-15 09:26:34 \n",
"4 0.0 0.0 2301 200 150 2022-04-09 16:05:54 \n",
"... ... ... ... ... ... ... \n",
"474964 2023.0 0.0 0 500 0 2023-04-24 10:28:08 \n",
"474965 2020.0 1.0 0 50 0 2023-04-19 15:04:24 \n",
"474966 0.0 -1.0 0 500 0 2023-04-19 16:54:15 \n",
"474967 0.0 -1.0 0 0 0 2023-04-23 10:19:01 \n",
"474968 2019.0 0.0 0 50 0 2023-04-24 14:14:28 \n",
"\n",
"[474969 rows x 17 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"user"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Write the frame back over the sample users file as TSV, keeping the header row and dropping the index.\n",
"user.to_csv(\n",
"    '../../data/sample/users.csv',\n",
"    sep='\\t',\n",
"    header=True,\n",
"    index=False,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n"
]
}
],
"source": [
"# Report whether the 'experience' column has any missing values.\n",
"print(user['experience'].isna().to_numpy().any())"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Load the tab-separated shixun table (per-item stats plus emb_* columns).\n",
"shixun = pd.read_csv(\n",
"    '../../data/sample/shixun_merage_emb.csv',\n",
"    sep='\\t',\n",
"    encoding='utf-8',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>shixun_id</th>\n",
" <th>visits</th>\n",
" <th>challenges_count</th>\n",
" <th>averge_star</th>\n",
" <th>task_pass</th>\n",
" <th>emb_0</th>\n",
" <th>emb_1</th>\n",
" <th>emb_2</th>\n",
" <th>emb_3</th>\n",
" <th>emb_4</th>\n",
" <th>...</th>\n",
" <th>emb_90</th>\n",
" <th>emb_91</th>\n",
" <th>emb_92</th>\n",
" <th>emb_93</th>\n",
" <th>emb_94</th>\n",
" <th>emb_95</th>\n",
" <th>emb_96</th>\n",
" <th>emb_97</th>\n",
" <th>emb_98</th>\n",
" <th>emb_99</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>43</td>\n",
" <td>1255</td>\n",
" <td>4</td>\n",
" <td>4.8</td>\n",
" <td>1</td>\n",
" <td>0.178059</td>\n",
" <td>-0.223661</td>\n",
" <td>0.431724</td>\n",
" <td>0.222664</td>\n",
" <td>-0.051939</td>\n",
" <td>...</td>\n",
" <td>1.277493</td>\n",
" <td>0.033392</td>\n",
" <td>0.085430</td>\n",
" <td>0.334810</td>\n",
" <td>0.194489</td>\n",
" <td>0.405730</td>\n",
" <td>0.223120</td>\n",
" <td>-0.162922</td>\n",
" <td>0.020598</td>\n",
" <td>0.260939</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>49</td>\n",
" <td>6208</td>\n",
" <td>3</td>\n",
" <td>4.8</td>\n",
" <td>0</td>\n",
" <td>0.029035</td>\n",
" <td>-0.233039</td>\n",
" <td>0.211546</td>\n",
" <td>0.176656</td>\n",
" <td>-0.227015</td>\n",
" <td>...</td>\n",
" <td>0.688604</td>\n",
" <td>0.257212</td>\n",
" <td>0.023406</td>\n",
" <td>0.462871</td>\n",
" <td>0.364423</td>\n",
" <td>0.555712</td>\n",
" <td>0.018691</td>\n",
" <td>0.159427</td>\n",
" <td>-0.310509</td>\n",
" <td>-0.173528</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>50</td>\n",
" <td>2527</td>\n",
" <td>6</td>\n",
" <td>4.8</td>\n",
" <td>0</td>\n",
" <td>-0.098617</td>\n",
" <td>0.150550</td>\n",
" <td>-0.053376</td>\n",
" <td>0.137188</td>\n",
" <td>0.076380</td>\n",
" <td>...</td>\n",
" <td>0.226777</td>\n",
" <td>0.334204</td>\n",
" <td>0.347067</td>\n",
" <td>0.351606</td>\n",
" <td>0.160054</td>\n",
" <td>0.212226</td>\n",
" <td>-0.042232</td>\n",
" <td>0.027391</td>\n",
" <td>-0.163952</td>\n",
" <td>0.083248</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>51</td>\n",
" <td>12651</td>\n",
" <td>3</td>\n",
" <td>4.9</td>\n",
" <td>0</td>\n",
" <td>0.107260</td>\n",
" <td>0.024969</td>\n",
" <td>0.001965</td>\n",
" <td>0.326085</td>\n",
" <td>-0.264945</td>\n",
" <td>...</td>\n",
" <td>0.883407</td>\n",
" <td>0.031283</td>\n",
" <td>-0.068510</td>\n",
" <td>0.483277</td>\n",
" <td>0.856700</td>\n",
" <td>0.685457</td>\n",
" <td>0.132283</td>\n",
" <td>0.315131</td>\n",
" <td>-0.675356</td>\n",
" <td>-0.090122</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>53</td>\n",
" <td>4043</td>\n",
" <td>4</td>\n",
" <td>4.9</td>\n",
" <td>0</td>\n",
" <td>0.143967</td>\n",
" <td>-0.325837</td>\n",
" <td>0.068179</td>\n",
" <td>0.015797</td>\n",
" <td>0.146450</td>\n",
" <td>...</td>\n",
" <td>0.674001</td>\n",
" <td>0.206088</td>\n",
" <td>0.140978</td>\n",
" <td>0.533451</td>\n",
" <td>0.263060</td>\n",
" <td>0.296306</td>\n",
" <td>-0.113665</td>\n",
" <td>0.418402</td>\n",
" <td>-0.367488</td>\n",
" <td>0.065241</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 105 columns</p>\n",
"</div>"
],
"text/plain": [
" shixun_id visits challenges_count averge_star task_pass emb_0 \\\n",
"0 43 1255 4 4.8 1 0.178059 \n",
"1 49 6208 3 4.8 0 0.029035 \n",
"2 50 2527 6 4.8 0 -0.098617 \n",
"3 51 12651 3 4.9 0 0.107260 \n",
"4 53 4043 4 4.9 0 0.143967 \n",
"\n",
" emb_1 emb_2 emb_3 emb_4 ... emb_90 emb_91 emb_92 \\\n",
"0 -0.223661 0.431724 0.222664 -0.051939 ... 1.277493 0.033392 0.085430 \n",
"1 -0.233039 0.211546 0.176656 -0.227015 ... 0.688604 0.257212 0.023406 \n",
"2 0.150550 -0.053376 0.137188 0.076380 ... 0.226777 0.334204 0.347067 \n",
"3 0.024969 0.001965 0.326085 -0.264945 ... 0.883407 0.031283 -0.068510 \n",
"4 -0.325837 0.068179 0.015797 0.146450 ... 0.674001 0.206088 0.140978 \n",
"\n",
" emb_93 emb_94 emb_95 emb_96 emb_97 emb_98 emb_99 \n",
"0 0.334810 0.194489 0.405730 0.223120 -0.162922 0.020598 0.260939 \n",
"1 0.462871 0.364423 0.555712 0.018691 0.159427 -0.310509 -0.173528 \n",
"2 0.351606 0.160054 0.212226 -0.042232 0.027391 -0.163952 0.083248 \n",
"3 0.483277 0.856700 0.685457 0.132283 0.315131 -0.675356 -0.090122 \n",
"4 0.533451 0.263060 0.296306 -0.113665 0.418402 -0.367488 0.065241 \n",
"\n",
"[5 rows x 105 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"shixun.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n"
]
}
],
"source": [
"# Report whether the 'task_pass' column has any missing values.\n",
"print(shixun['task_pass'].isna().to_numpy().any())"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Left-hand frame for the merge demo.\n",
"name = ['a', 'b', 'c', 'd']\n",
"list1 = [\n",
"    [5, 2, 0, 4],\n",
"    [2, 3, 4, 5],\n",
"    [3, 6, 1, 9],\n",
"    [4, 1, 0, 8],\n",
"]\n",
"df1 = pd.DataFrame(data=list1, columns=name)\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Right-hand frame for the merge demo; shares columns 'a' and 'd' with df1.\n",
"name = ['a', 'e', 'f', 'd']\n",
"list1 = [\n",
"    [5, 2, 0, 4],\n",
"    [1, 1, 1, 5],\n",
"    [1, 1, 1, 9],\n",
"    [4, 1, 0, 8],\n",
"]\n",
"df2 = pd.DataFrame(data=list1, columns=name)\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" <th>d_x</th>\n",
" <th>e</th>\n",
" <th>f</th>\n",
" <th>d_y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a b c d_x e f d_y\n",
"0 5 2 0 4 2 0 4\n",
"1 4 1 0 8 1 0 8"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Join the two demo frames on column 'a'; the shared column 'd' comes back as d_x / d_y.\n",
"df = df1.merge(df2, on=['a'])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the tab-separated shixun table (per-item stats plus emb_* columns).\n",
"shixun = pd.read_csv(\n",
"    '../../data/sample/shixun_merage_emb.csv',\n",
"    sep='\\t',\n",
"    encoding='utf-8',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>shixun_id</th>\n",
" <th>visits</th>\n",
" <th>challenges_count</th>\n",
" <th>averge_star</th>\n",
" <th>task_pass</th>\n",
" <th>emb_0</th>\n",
" <th>emb_1</th>\n",
" <th>emb_2</th>\n",
" <th>emb_3</th>\n",
" <th>emb_4</th>\n",
" <th>...</th>\n",
" <th>emb_90</th>\n",
" <th>emb_91</th>\n",
" <th>emb_92</th>\n",
" <th>emb_93</th>\n",
" <th>emb_94</th>\n",
" <th>emb_95</th>\n",
" <th>emb_96</th>\n",
" <th>emb_97</th>\n",
" <th>emb_98</th>\n",
" <th>emb_99</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>43</td>\n",
" <td>1255</td>\n",
" <td>4</td>\n",
" <td>4.8</td>\n",
" <td>1</td>\n",
" <td>0.178059</td>\n",
" <td>-0.223661</td>\n",
" <td>0.431724</td>\n",
" <td>0.222664</td>\n",
" <td>-0.051939</td>\n",
" <td>...</td>\n",
" <td>1.277493</td>\n",
" <td>0.033392</td>\n",
" <td>0.085430</td>\n",
" <td>0.334810</td>\n",
" <td>0.194489</td>\n",
" <td>0.405730</td>\n",
" <td>0.223120</td>\n",
" <td>-0.162922</td>\n",
" <td>0.020598</td>\n",
" <td>0.260939</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>49</td>\n",
" <td>6208</td>\n",
" <td>3</td>\n",
" <td>4.8</td>\n",
" <td>0</td>\n",
" <td>0.029035</td>\n",
" <td>-0.233039</td>\n",
" <td>0.211546</td>\n",
" <td>0.176656</td>\n",
" <td>-0.227015</td>\n",
" <td>...</td>\n",
" <td>0.688604</td>\n",
" <td>0.257212</td>\n",
" <td>0.023406</td>\n",
" <td>0.462871</td>\n",
" <td>0.364423</td>\n",
" <td>0.555712</td>\n",
" <td>0.018691</td>\n",
" <td>0.159427</td>\n",
" <td>-0.310509</td>\n",
" <td>-0.173528</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>50</td>\n",
" <td>2527</td>\n",
" <td>6</td>\n",
" <td>4.8</td>\n",
" <td>0</td>\n",
" <td>-0.098617</td>\n",
" <td>0.150550</td>\n",
" <td>-0.053376</td>\n",
" <td>0.137188</td>\n",
" <td>0.076380</td>\n",
" <td>...</td>\n",
" <td>0.226777</td>\n",
" <td>0.334204</td>\n",
" <td>0.347067</td>\n",
" <td>0.351606</td>\n",
" <td>0.160054</td>\n",
" <td>0.212226</td>\n",
" <td>-0.042232</td>\n",
" <td>0.027391</td>\n",
" <td>-0.163952</td>\n",
" <td>0.083248</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>51</td>\n",
" <td>12651</td>\n",
" <td>3</td>\n",
" <td>4.9</td>\n",
" <td>0</td>\n",
" <td>0.107260</td>\n",
" <td>0.024969</td>\n",
" <td>0.001965</td>\n",
" <td>0.326085</td>\n",
" <td>-0.264945</td>\n",
" <td>...</td>\n",
" <td>0.883407</td>\n",
" <td>0.031283</td>\n",
" <td>-0.068510</td>\n",
" <td>0.483277</td>\n",
" <td>0.856700</td>\n",
" <td>0.685457</td>\n",
" <td>0.132283</td>\n",
" <td>0.315131</td>\n",
" <td>-0.675356</td>\n",
" <td>-0.090122</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>53</td>\n",
" <td>4043</td>\n",
" <td>4</td>\n",
" <td>4.9</td>\n",
" <td>0</td>\n",
" <td>0.143967</td>\n",
" <td>-0.325837</td>\n",
" <td>0.068179</td>\n",
" <td>0.015797</td>\n",
" <td>0.146450</td>\n",
" <td>...</td>\n",
" <td>0.674001</td>\n",
" <td>0.206088</td>\n",
" <td>0.140978</td>\n",
" <td>0.533451</td>\n",
" <td>0.263060</td>\n",
" <td>0.296306</td>\n",
" <td>-0.113665</td>\n",
" <td>0.418402</td>\n",
" <td>-0.367488</td>\n",
" <td>0.065241</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 105 columns</p>\n",
"</div>"
],
"text/plain": [
" shixun_id visits challenges_count averge_star task_pass emb_0 \\\n",
"0 43 1255 4 4.8 1 0.178059 \n",
"1 49 6208 3 4.8 0 0.029035 \n",
"2 50 2527 6 4.8 0 -0.098617 \n",
"3 51 12651 3 4.9 0 0.107260 \n",
"4 53 4043 4 4.9 0 0.143967 \n",
"\n",
" emb_1 emb_2 emb_3 emb_4 ... emb_90 emb_91 emb_92 \\\n",
"0 -0.223661 0.431724 0.222664 -0.051939 ... 1.277493 0.033392 0.085430 \n",
"1 -0.233039 0.211546 0.176656 -0.227015 ... 0.688604 0.257212 0.023406 \n",
"2 0.150550 -0.053376 0.137188 0.076380 ... 0.226777 0.334204 0.347067 \n",
"3 0.024969 0.001965 0.326085 -0.264945 ... 0.883407 0.031283 -0.068510 \n",
"4 -0.325837 0.068179 0.015797 0.146450 ... 0.674001 0.206088 0.140978 \n",
"\n",
" emb_93 emb_94 emb_95 emb_96 emb_97 emb_98 emb_99 \n",
"0 0.334810 0.194489 0.405730 0.223120 -0.162922 0.020598 0.260939 \n",
"1 0.462871 0.364423 0.555712 0.018691 0.159427 -0.310509 -0.173528 \n",
"2 0.351606 0.160054 0.212226 -0.042232 0.027391 -0.163952 0.083248 \n",
"3 0.483277 0.856700 0.685457 0.132283 0.315131 -0.675356 -0.090122 \n",
"4 0.533451 0.263060 0.296306 -0.113665 0.418402 -0.367488 0.065241 \n",
"\n",
"[5 rows x 105 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"shixun.head()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the tab-separated shixun table under the name used by the column-selection cell.\n",
"item_emb_df = pd.read_csv(\n",
"    '../../data/sample/shixun_merage_emb.csv',\n",
"    sep='\\t',\n",
"    encoding='utf-8',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Collect every column name that does not contain 'shixun_id'.\n",
"item_emb_cols = list(filter(lambda col: 'shixun_id' not in col, item_emb_df.columns))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['visits',\n",
" 'challenges_count',\n",
" 'averge_star',\n",
" 'task_pass',\n",
" 'emb_0',\n",
" 'emb_1',\n",
" 'emb_2',\n",
" 'emb_3',\n",
" 'emb_4',\n",
" 'emb_5',\n",
" 'emb_6',\n",
" 'emb_7',\n",
" 'emb_8',\n",
" 'emb_9',\n",
" 'emb_10',\n",
" 'emb_11',\n",
" 'emb_12',\n",
" 'emb_13',\n",
" 'emb_14',\n",
" 'emb_15',\n",
" 'emb_16',\n",
" 'emb_17',\n",
" 'emb_18',\n",
" 'emb_19',\n",
" 'emb_20',\n",
" 'emb_21',\n",
" 'emb_22',\n",
" 'emb_23',\n",
" 'emb_24',\n",
" 'emb_25',\n",
" 'emb_26',\n",
" 'emb_27',\n",
" 'emb_28',\n",
" 'emb_29',\n",
" 'emb_30',\n",
" 'emb_31',\n",
" 'emb_32',\n",
" 'emb_33',\n",
" 'emb_34',\n",
" 'emb_35',\n",
" 'emb_36',\n",
" 'emb_37',\n",
" 'emb_38',\n",
" 'emb_39',\n",
" 'emb_40',\n",
" 'emb_41',\n",
" 'emb_42',\n",
" 'emb_43',\n",
" 'emb_44',\n",
" 'emb_45',\n",
" 'emb_46',\n",
" 'emb_47',\n",
" 'emb_48',\n",
" 'emb_49',\n",
" 'emb_50',\n",
" 'emb_51',\n",
" 'emb_52',\n",
" 'emb_53',\n",
" 'emb_54',\n",
" 'emb_55',\n",
" 'emb_56',\n",
" 'emb_57',\n",
" 'emb_58',\n",
" 'emb_59',\n",
" 'emb_60',\n",
" 'emb_61',\n",
" 'emb_62',\n",
" 'emb_63',\n",
" 'emb_64',\n",
" 'emb_65',\n",
" 'emb_66',\n",
" 'emb_67',\n",
" 'emb_68',\n",
" 'emb_69',\n",
" 'emb_70',\n",
" 'emb_71',\n",
" 'emb_72',\n",
" 'emb_73',\n",
" 'emb_74',\n",
" 'emb_75',\n",
" 'emb_76',\n",
" 'emb_77',\n",
" 'emb_78',\n",
" 'emb_79',\n",
" 'emb_80',\n",
" 'emb_81',\n",
" 'emb_82',\n",
" 'emb_83',\n",
" 'emb_84',\n",
" 'emb_85',\n",
" 'emb_86',\n",
" 'emb_87',\n",
" 'emb_88',\n",
" 'emb_89',\n",
" 'emb_90',\n",
" 'emb_91',\n",
" 'emb_92',\n",
" 'emb_93',\n",
" 'emb_94',\n",
" 'emb_95',\n",
" 'emb_96',\n",
" 'emb_97',\n",
" 'emb_98',\n",
" 'emb_99']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"item_emb_cols"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "mooc",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}