You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
store/第九章实战.ipynb

1773 lines
233 KiB

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "61641ecb",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>item_id</th>\n",
" <th>category_id</th>\n",
" <th>behavior_type</th>\n",
" <th>time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2268318</td>\n",
" <td>2520377</td>\n",
" <td>pv</td>\n",
" <td>1511544070</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2333346</td>\n",
" <td>2520771</td>\n",
" <td>pv</td>\n",
" <td>1511561733</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2576651</td>\n",
" <td>149192</td>\n",
" <td>pv</td>\n",
" <td>1511572885</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>3830808</td>\n",
" <td>4181361</td>\n",
" <td>pv</td>\n",
" <td>1511593493</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>4365585</td>\n",
" <td>2520377</td>\n",
" <td>pv</td>\n",
" <td>1511596146</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user_id item_id category_id behavior_type time\n",
"0 1 2268318 2520377 pv 1511544070\n",
"1 1 2333346 2520771 pv 1511561733\n",
"2 1 2576651 149192 pv 1511572885\n",
"3 1 3830808 4181361 pv 1511593493\n",
"4 1 4365585 2520377 pv 1511596146"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import datetime\n",
"import matplotlib\n",
"data = pd.read_csv('UserBehavior.csv', header=None, nrows=200000,\n",
" names=['user_id', 'item_id', 'category_id', 'behavior_type', 'time']) #由于数据量过大这里只导入20万条数据\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "57d6c412",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"user_id 0\n",
"item_id 0\n",
"category_id 0\n",
"behavior_type 0\n",
"time 0\n",
"dtype: int64"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "caf4c62c",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th>user_id</th>\n",
" </tr>\n",
" <tr>\n",
" <th>user_id</th>\n",
" <th>item_id</th>\n",
" <th>time</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [user_id]\n",
"Index: []"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"repeat = data.groupby(['user_id','item_id','time']).agg({'user_id':'count'})\n",
"repeat[repeat['user_id'] > 1]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "9e826afe",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>item_id</th>\n",
" <th>category_id</th>\n",
" <th>behavior_type</th>\n",
" <th>time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2268318</td>\n",
" <td>2520377</td>\n",
" <td>pv</td>\n",
" <td>2017-11-25 01:21:10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2333346</td>\n",
" <td>2520771</td>\n",
" <td>pv</td>\n",
" <td>2017-11-25 06:15:33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2576651</td>\n",
" <td>149192</td>\n",
" <td>pv</td>\n",
" <td>2017-11-25 09:21:25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>3830808</td>\n",
" <td>4181361</td>\n",
" <td>pv</td>\n",
" <td>2017-11-25 15:04:53</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>4365585</td>\n",
" <td>2520377</td>\n",
" <td>pv</td>\n",
" <td>2017-11-25 15:49:06</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user_id item_id category_id behavior_type time\n",
"0 1 2268318 2520377 pv 2017-11-25 01:21:10\n",
"1 1 2333346 2520771 pv 2017-11-25 06:15:33\n",
"2 1 2576651 149192 pv 2017-11-25 09:21:25\n",
"3 1 3830808 4181361 pv 2017-11-25 15:04:53\n",
"4 1 4365585 2520377 pv 2017-11-25 15:49:06"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['time'] = pd.to_datetime(data['time'], unit='s') + datetime.timedelta(hours=8)\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d45943ca",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>item_id</th>\n",
" <th>category_id</th>\n",
" <th>behavior_type</th>\n",
" <th>time</th>\n",
" <th>date</th>\n",
" <th>hour</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2268318</td>\n",
" <td>2520377</td>\n",
" <td>pv</td>\n",
" <td>2017-11-25 01:21:10</td>\n",
" <td>2017-11-25</td>\n",
" <td>01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2333346</td>\n",
" <td>2520771</td>\n",
" <td>pv</td>\n",
" <td>2017-11-25 06:15:33</td>\n",
" <td>2017-11-25</td>\n",
" <td>06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2576651</td>\n",
" <td>149192</td>\n",
" <td>pv</td>\n",
" <td>2017-11-25 09:21:25</td>\n",
" <td>2017-11-25</td>\n",
" <td>09</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>3830808</td>\n",
" <td>4181361</td>\n",
" <td>pv</td>\n",
" <td>2017-11-25 15:04:53</td>\n",
" <td>2017-11-25</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>4365585</td>\n",
" <td>2520377</td>\n",
" <td>pv</td>\n",
" <td>2017-11-25 15:49:06</td>\n",
" <td>2017-11-25</td>\n",
" <td>15</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user_id item_id category_id behavior_type time \\\n",
"0 1 2268318 2520377 pv 2017-11-25 01:21:10 \n",
"1 1 2333346 2520771 pv 2017-11-25 06:15:33 \n",
"2 1 2576651 149192 pv 2017-11-25 09:21:25 \n",
"3 1 3830808 4181361 pv 2017-11-25 15:04:53 \n",
"4 1 4365585 2520377 pv 2017-11-25 15:49:06 \n",
"\n",
" date hour \n",
"0 2017-11-25 01 \n",
"1 2017-11-25 06 \n",
"2 2017-11-25 09 \n",
"3 2017-11-25 15 \n",
"4 2017-11-25 15 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['date'] = data['time'].map(lambda x: x.strftime('%Y-%m-%d %H').split(' ')[0]) # 设置日期列\n",
"data['hour'] = data['time'].map(lambda x: x.strftime('%Y-%m-%d %H').split(' ')[1]) # 设置时间列\n",
"pd.set_option('display.max_columns', 10)\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a78247b7",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>item_id</th>\n",
" <th>category_id</th>\n",
" <th>behavior_type</th>\n",
" <th>time</th>\n",
" <th>date</th>\n",
" <th>hour</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1000169</td>\n",
" <td>1328010</td>\n",
" <td>959452</td>\n",
" <td>pv</td>\n",
" <td>2017-09-11 16:16:39</td>\n",
" <td>2017-09-11</td>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1004259</td>\n",
" <td>3734552</td>\n",
" <td>1573426</td>\n",
" <td>pv</td>\n",
" <td>2017-11-17 21:22:30</td>\n",
" <td>2017-11-17</td>\n",
" <td>21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1007503</td>\n",
" <td>2137467</td>\n",
" <td>2778281</td>\n",
" <td>pv</td>\n",
" <td>2017-11-19 06:36:15</td>\n",
" <td>2017-11-19</td>\n",
" <td>06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1006359</td>\n",
" <td>359872</td>\n",
" <td>84264</td>\n",
" <td>pv</td>\n",
" <td>2017-11-20 01:32:45</td>\n",
" <td>2017-11-20</td>\n",
" <td>01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1000801</td>\n",
" <td>1034143</td>\n",
" <td>2465336</td>\n",
" <td>pv</td>\n",
" <td>2017-11-20 22:15:14</td>\n",
" <td>2017-11-20</td>\n",
" <td>22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1007609</td>\n",
" <td>4146999</td>\n",
" <td>235534</td>\n",
" <td>pv</td>\n",
" <td>2017-11-22 21:01:05</td>\n",
" <td>2017-11-22</td>\n",
" <td>21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1007609</td>\n",
" <td>2903641</td>\n",
" <td>1379146</td>\n",
" <td>pv</td>\n",
" <td>2017-11-22 21:01:10</td>\n",
" <td>2017-11-22</td>\n",
" <td>21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1007609</td>\n",
" <td>1544812</td>\n",
" <td>235534</td>\n",
" <td>pv</td>\n",
" <td>2017-11-22 21:02:23</td>\n",
" <td>2017-11-22</td>\n",
" <td>21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1007609</td>\n",
" <td>3422704</td>\n",
" <td>1379146</td>\n",
" <td>pv</td>\n",
" <td>2017-11-22 21:02:32</td>\n",
" <td>2017-11-22</td>\n",
" <td>21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1000807</td>\n",
" <td>1662243</td>\n",
" <td>3354571</td>\n",
" <td>pv</td>\n",
" <td>2017-11-23 02:03:21</td>\n",
" <td>2017-11-23</td>\n",
" <td>02</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user_id item_id category_id behavior_type time \\\n",
"0 1000169 1328010 959452 pv 2017-09-11 16:16:39 \n",
"1 1004259 3734552 1573426 pv 2017-11-17 21:22:30 \n",
"2 1007503 2137467 2778281 pv 2017-11-19 06:36:15 \n",
"3 1006359 359872 84264 pv 2017-11-20 01:32:45 \n",
"4 1000801 1034143 2465336 pv 2017-11-20 22:15:14 \n",
"5 1007609 4146999 235534 pv 2017-11-22 21:01:05 \n",
"6 1007609 2903641 1379146 pv 2017-11-22 21:01:10 \n",
"7 1007609 1544812 235534 pv 2017-11-22 21:02:23 \n",
"8 1007609 3422704 1379146 pv 2017-11-22 21:02:32 \n",
"9 1000807 1662243 3354571 pv 2017-11-23 02:03:21 \n",
"\n",
" date hour \n",
"0 2017-09-11 16 \n",
"1 2017-11-17 21 \n",
"2 2017-11-19 06 \n",
"3 2017-11-20 01 \n",
"4 2017-11-20 22 \n",
"5 2017-11-22 21 \n",
"6 2017-11-22 21 \n",
"7 2017-11-22 21 \n",
"8 2017-11-22 21 \n",
"9 2017-11-23 02 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = data.sort_values(by=['date', 'hour'], ascending=True)\n",
"data = data.reset_index(drop=True)\n",
"data.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f95ac6e5",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>item_id</th>\n",
" <th>category_id</th>\n",
" <th>behavior_type</th>\n",
" <th>time</th>\n",
" <th>date</th>\n",
" <th>hour</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1000</td>\n",
" <td>1385281</td>\n",
" <td>2352202</td>\n",
" <td>pv</td>\n",
" <td>2017-11-25 00:44:13</td>\n",
" <td>2017-11-25</td>\n",
" <td>00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1000</td>\n",
" <td>5120034</td>\n",
" <td>1051370</td>\n",
" <td>cart</td>\n",
" <td>2017-11-25 00:47:14</td>\n",
" <td>2017-11-25</td>\n",
" <td>00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1000004</td>\n",
" <td>2156592</td>\n",
" <td>3607361</td>\n",
" <td>pv</td>\n",
" <td>2017-11-25 00:00:41</td>\n",
" <td>2017-11-25</td>\n",
" <td>00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1000004</td>\n",
" <td>1591982</td>\n",
" <td>672001</td>\n",
" <td>pv</td>\n",
" <td>2017-11-25 00:02:13</td>\n",
" <td>2017-11-25</td>\n",
" <td>00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1000084</td>\n",
" <td>850738</td>\n",
" <td>2058468</td>\n",
" <td>pv</td>\n",
" <td>2017-11-25 00:55:17</td>\n",
" <td>2017-11-25</td>\n",
" <td>00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1000084</td>\n",
" <td>4288055</td>\n",
" <td>144028</td>\n",
" <td>pv</td>\n",
" <td>2017-11-25 00:56:07</td>\n",
" <td>2017-11-25</td>\n",
" <td>00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1000084</td>\n",
" <td>4474837</td>\n",
" <td>144028</td>\n",
" <td>pv</td>\n",
" <td>2017-11-25 00:56:52</td>\n",
" <td>2017-11-25</td>\n",
" <td>00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1000084</td>\n",
" <td>4288055</td>\n",
" <td>144028</td>\n",
" <td>pv</td>\n",
" <td>2017-11-25 00:57:27</td>\n",
" <td>2017-11-25</td>\n",
" <td>00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1000084</td>\n",
" <td>4474837</td>\n",
" <td>144028</td>\n",
" <td>pv</td>\n",
" <td>2017-11-25 00:58:59</td>\n",
" <td>2017-11-25</td>\n",
" <td>00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1000084</td>\n",
" <td>4288055</td>\n",
" <td>144028</td>\n",
" <td>pv</td>\n",
" <td>2017-11-25 00:59:09</td>\n",
" <td>2017-11-25</td>\n",
" <td>00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user_id item_id category_id behavior_type time \\\n",
"0 1000 1385281 2352202 pv 2017-11-25 00:44:13 \n",
"1 1000 5120034 1051370 cart 2017-11-25 00:47:14 \n",
"2 1000004 2156592 3607361 pv 2017-11-25 00:00:41 \n",
"3 1000004 1591982 672001 pv 2017-11-25 00:02:13 \n",
"4 1000084 850738 2058468 pv 2017-11-25 00:55:17 \n",
"5 1000084 4288055 144028 pv 2017-11-25 00:56:07 \n",
"6 1000084 4474837 144028 pv 2017-11-25 00:56:52 \n",
"7 1000084 4288055 144028 pv 2017-11-25 00:57:27 \n",
"8 1000084 4474837 144028 pv 2017-11-25 00:58:59 \n",
"9 1000084 4288055 144028 pv 2017-11-25 00:59:09 \n",
"\n",
" date hour \n",
"0 2017-11-25 00 \n",
"1 2017-11-25 00 \n",
"2 2017-11-25 00 \n",
"3 2017-11-25 00 \n",
"4 2017-11-25 00 \n",
"5 2017-11-25 00 \n",
"6 2017-11-25 00 \n",
"7 2017-11-25 00 \n",
"8 2017-11-25 00 \n",
"9 2017-11-25 00 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_bool = (data.loc[:, 'date'] > '2017-11-24') & (data.loc[:, 'date'] < '2017-12-04')\n",
"data = data.loc[df_bool, :].reset_index(drop=True)\n",
"data.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d259d2b5",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>item_id</th>\n",
" <th>category_id</th>\n",
" <th>behavior_type</th>\n",
" <th>time</th>\n",
" <th>date</th>\n",
" <th>hour</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [user_id, item_id, category_id, behavior_type, time, date, hour]\n",
"Index: []"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"drop_data = data[(data['behavior_type'] != 'pv' ) &\n",
" (data['behavior_type'] != 'cart' ) &\n",
" (data['behavior_type'] != 'buy' ) &\n",
" (data['behavior_type'] != 'fav' )]\n",
"drop_data"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7c502428",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"user_id False\n",
"item_id False\n",
"category_id False\n",
"behavior_type False\n",
"time False\n",
"date False\n",
"hour False\n",
"dtype: bool"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.isnull().any()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "061c1691",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0cAAAIbCAYAAAAggeW7AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB31klEQVR4nO3dd3gU1eLG8Xez6YQkJAEhEDBIBEGKUgQFEWxIk2JDFFHAygX1ij8LxYoNUFGsKFwRbKggojQpVxEUkCJEEBAEE0qAwAZC6s7vj7m7ZMkm2U2yqd/P8+yTzM7MmTMnIcy758wZi2EYhgAAAACgmvMr7woAAAAAQEVAOAIAAAAAEY4AAAAAQBLhCAAAAAAkEY4AAAAAQBLhCAAAAAAkEY4AAAAAQBLhCAAAAAAkEY4AwGNHjhzR2rVrtXbtWiUlJZV3daq89PR0paSk+Pw4O3bs8PkxAACVA+EIQJnYsGGD3nnnHb3zzjvatWuXx/stW7ZMjzzyiB555BH9888/Pqxh0X744Qd16tRJnTp10n/+858SlWWz2fTSSy/pp59+KqXaVR02m01jxoxRo0aN9O9//9unxzp69KjatWunTp06ae7cucrNzfXp8VA2Dh06pOXLl5d3NQBUQv7lXQEA1cPSpUv1+OOPS5IWLlyoJk2aeLTf5s2bNXnyZEnSvffe67P6eSIoKMjt957Izc3Vzp07tXr1ai1evFiLFy+WzWbTlVdeqWXLljm3S0lJUZ06dYos7/PPP9eNN97o8fEzMjKUkZHhVZ09ZbVaVbNmzVIrLywsTAsXLtSRI0c0Z84cPf3004qPjy+18vOaNGmSTp48qbVr1+rLL7/UwIEDfXKcs504cULZ2dkKDQ1VQECA/P39ZbFYilVWdna2MjMzna+oqCgFBwcXud+GDRt0zz33aNq0abrkkkuKdey87Ha7tm7dqmXLlumHH37QZ599prCwsAK3/+WXX9SxY8ciyw0LC1NaWlqRx96+fbtWrlyp77//XosXL5bVatXq1at18cUXe1T/3377TXPmzPFoW0/16tVL3bp1K9UyAfgW4QhAmQgJCXF+X6NGDY/3y7utJxd8vuRtOEpNTVXfvn119OhR/fXXX8rMzMy3zapVq7R06VJdffXVkqSAgACP6hIeHu5hrU2TJk3SuHHjvNrHU82aNdMff/xRauX5+flp3LhxuvXWW5Wbm6vJkyfrzTffLLXyHQ4fPuwst3Hjxpo+fXqxA0pOTo5ycnJ0+vRppaenq379+oVu/8ILL+ill14q1rGK8v3336tHjx6FbrNjxw716NFDR44cUefOnfXkk09q3LhxslqtRZZvGIaSk5O1c+dObd++Xdu2bdOWLVu0ceNGlxAzcuRIzZw5s8ByPP07EBoa6vz+xIkT+ueff/T3339r165d+uOPP7R161Zt3rw5X4DKzs5Wv379tHHjRkVHRxd5nD179jg/iCkt5557LuEIqGQIRwBKXUZGhgICAlwutPJe4HhyAeZQWFjIysqSv7+//PzOjBDOzs7W7t27PSo7MjJSdevW9bgugYGBHtXLwc/Pz2XYnNVqVdOmTXXRRRepbdu2at++vS6++GKXtsl7jHfeeUc333yzc3nfvn1q3bq1JNew6QlfBktve9E8ceONN+rhhx/WwYMHNXPmTD3//POKiIgo1WPcc889OnnypCTpr7/+KrSXw1uGYZRaWb7QsGFDDRo0SG+++aZycnL09NNP68cff9Tnn3+u6OhoJSYmasmSJcrKytLx48d1+PBhJSUlaf/+/frrr790+vTpAsuuUaOGzjvvPKWmpioxMVHNmzd3u13e3/tt27YpNjbWZf2cOXP0wAMPKCQkRBkZGbrgggu0d+/eAo8bHR2tFi1aqGXLlmrdurUuuugitWzZ0uPfz7xhbf/+/WrQoIFH+7nj7++v3NzcUv2dAlA2CEcASl3v3r31ww8/KCgoyBmSsrKynOuvu+46jwNS3v1atGghi8WinJwcZWVlKTs7Wxs3blSbNm2c2xw4cEAXXHCBR2Xfc889eueddzw7KcmlVyFvICtI3outESNGaOrUqUWGlOzsbOf3kZGRioyMdC4fP37c+b2nPUzuti/swu+5555z9jDt2bNH5557boFlnnvuufr77799Eo78/f01ZMgQvfzyy6pXr562bdumSy+9tNTKnzlzpubNm1dq5Xnrsssu06OPPqqwsDD5+/vLarW6/Z367bff9Mknn0gyh5Wed955+bbJyspyGVbXqFGjIo8fEhKiqVOn6rrrrtPtt9+uo0ePavny5WrXrp2+/vprJSQk6JVXXlFycnKh5cTGxuruu+9WfHy8GjdurCZNmnj8gYO//5lLkPDwcJffdelMePL391dwcLCGDx+usWPHymKxqHHjxmrRooXOOeccvf/++5Kkb7/91qNhegXJ+2/kiiuucKmftxz3ruX9sANA5UA4AlDq/Pz8FBwcrKCgIOfXU6dOOT9tDgsL8/iC2mazOfcLDw+X1WpVbm6usrKydOrUqXzDoLwJDd5euOS9ePUkHDnuIzEMw+P7QPKGwcKGznk7/MvbMOUNX10Ajhw5UjfddJPatm1bquXu3r1bo0ePliRFRUXp119/LXIYXGEMw1BOTo5yc3OVnp5eaK+KQ58+fdSnT58it/v444+d4ejmm2/WFVdcUex6unPddddpw4YN6t27t7Zu3ar9+/dr/fr1atOmjfr27aulS5fq/PPPV0JCgs4991w1atRIDRs21IMPPqjVq1crISFBEyZMKNaxPQ0fjg9S7rjjDnXr1k2tW7d2fvCwd+9eZzgqqbwf2Hja+1wUT/5OAKhYCEcASt2SJUvyvTdz5kzdeeedkqQvvvhCnTt39qis1157TQ899JAkac2aNUUOdcl7wTVs2DBNnz7dZf3evXudN/cHBAQoNzdX//zzj4KCghQYGOi8Od7xSb6fn58sFossFovLxZOfn5+zB+vkyZMKCAhQrVq18tXHarUqJyfHo3OVpFOnTjm/92RiBk95M5TRl2UfOHAg3/Cp0uTuZ56XzWZT3759ZbPZZLFY9NFHH7ntjSmus3s/KoNGjRpp9erV6tmzp26//XYNHz5ckvT2228XuE9pDNP0dOihY7sGDRqUaKhbUfL+Oy2tYXXe/NsHUDEQjgBUKd4MhfH391dKSkqhQ8cKctddd+muu+5yLv/73//WpEmT8m3nbTjK+4l1aV60+zIcefPpuC97sKTCL9ozMjJ0/fXXKzExUZL02GOPqVevXj6tT2URHh6u5cuXl+kwMHcTlLiTtzfVlzytjzd8NUMkAN8hHAEoc4MHD/Z4QoHU1FSvyvbm4rsk9xR469ixY0pMTFRubq4yMjKUnp6u48ePKy0tTbfddptzO8d9MOedd16hvRDeXnT5cniPN2Xnvfh+4IEHNHLkyBIf/+uvv9YTTzwhqeDJIdLT09W/f3+tXLlSkjmc7Nlnn9W7775balPEJyUl+bRXzNfK+v6YihaO0tPTnd/HxcWVSpl5e4IBVA6EIwBlbt++fT4r25vAY7VaVbt2bf35558KDAxUYGCgrFarAgICnEPqHD0uFotFa9as0ZVXXilJeu+993TLLbcoJydHp06dKnKY0fvvv+/23ojo6GhnONq5c6dmzJghSRowYEC+bfPeg/TSSy8pIyNDQUFBSk9Pl8ViUc+ePQs8ft57lBwTW7iT94K1VatWhQYfm81W4LqC5A2vMTExatasmddlnK1evXrO791d4NtsNnXv3l0bNmyQZJ7XZ599JqvVqhMnTpT4+A6eTBddEezdu1cDBgzQSy+95JxCPq/Dhw/L399foaGhziGmxZniPCcnR9nZ2Tp58qTCwsLyfSCS93etffv2+Xo3HWHFFz067jRu3LjUHzrcqlWrUi0PgO8RjgCUuR9//LFY9xx5wpuLOEf4SUhI8Gj7s6fydjz41N29RmeLjIxUrVq1nDfvZ2dnO6c8l8x7cfr3769Tp04pNDRUDz74YL4yoqKidO2112rx4sVatGiRFi1a5Fw3YcIEj8ORp6GmqAdvFoevh9W5C3Ph4eEaP36
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"\n",
"pv_daily = data[data['behavior_type'] == 'pv'].groupby('date')['user_id'].count()\n",
"pv_daily = pv_daily.reset_index().rename(columns={'user_id': 'pv'})\n",
"uv_daily = data.groupby('date')['user_id'].apply(lambda x: x.drop_duplicates().count())\n",
"uv_daily = uv_daily.reset_index().rename(columns={'user_id': 'uv'})\n",
"x = pv_daily['date']\n",
"y1 = pv_daily['pv']\n",
"y2 = uv_daily['uv']\n",
"fig = plt.figure(figsize=(10, 6))\n",
"matplotlib.rcParams['font.sans-serif'] = ['SimHei'] \n",
"matplotlib.rcParams['font.family']='sans-serif'\n",
"plt.subplot(1, 1, 1)\n",
"plt.plot(x, y1, label='访问量', linewidth=1.8, color='r', marker='o', markersize=4)\n",
"plt.plot(x, y2, label='用户量', linewidth=1.8, color='g', linestyle='-.', marker='^', markersize=4)\n",
"plt.legend(loc='best')\n",
"plt.title(\"某时段用户每天活跃量\", fontsize=24)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "85567ce1",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"{'浏览次数': user_id\n",
" user_id \n",
" 1 55\n",
" 100 84\n",
" 1000 67\n",
" 10001 21\n",
" 10008 41\n",
" ... ...\n",
" 1008883 395\n",
" 1008891 69\n",
" 1008899 86\n",
" 1008904 130\n",
" 1008905 7\n",
" \n",
" [1967 rows x 1 columns],\n",
" '加购次数': user_id\n",
" 1000 2\n",
" 10008 12\n",
" 10009 11\n",
" 10020 2\n",
" 10021 17\n",
" ..\n",
" 1008882 13\n",
" 1008883 10\n",
" 1008891 3\n",
" 1008899 1\n",
" 1008904 20\n",
" Name: behavior_type, Length: 1466, dtype: int64,\n",
" '收藏次数': user_id\n",
" 100 6\n",
" 1000 12\n",
" 10013 38\n",
" 10020 1\n",
" 10024 2\n",
" ..\n",
" 1008829 5\n",
" 1008830 4\n",
" 1008849 2\n",
" 1008860 1\n",
" 1008882 1\n",
" Name: behavior_type, Length: 725, dtype: int64,\n",
" '购买次数': user_id\n",
" 100 8\n",
" 10008 3\n",
" 10009 10\n",
" 10020 1\n",
" 10021 1\n",
" ..\n",
" 1008882 2\n",
" 1008883 3\n",
" 1008891 5\n",
" 1008899 1\n",
" 1008904 7\n",
" Name: behavior_type, Length: 1369, dtype: int64}"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from pandas import DataFrame\n",
"import pandas as pd\n",
"user = {}\n",
"user['浏览次数'] = data[data['behavior_type'] == 'pv'].groupby('user_id').agg({'user_id':'count'})\n",
"user['加购次数'] = data[data['behavior_type'] == 'cart'].groupby('user_id')['behavior_type'].count()\n",
"user['收藏次数'] = data[data['behavior_type'] == 'fav'].groupby('user_id')['behavior_type'].count()\n",
"user['购买次数'] = data[data['behavior_type'] == 'buy'].groupby('user_id')['behavior_type'].count()\n",
"# data_1 = DataFrame(user,index=[0])\n",
"# data_1\n",
"user"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "afaa636a",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"user_id 1973\n",
"item_id 117031\n",
"category_id 3980\n",
"dtype: int64 \n",
" 用户行为数: 199908\n"
]
}
],
"source": [
"base_count =data[['user_id','item_id','category_id']].nunique()\n",
"behaviour_count = data['behavior_type'].count()\n",
"print(base_count,'\\n','用户行为数:',behaviour_count)\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "40768d9f",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"np.float64(91.09934110491638)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"behaviour_group = data.groupby(['behavior_type']).count()\n",
"behaviour_group # 将用户每种行为分类\n",
"PV = behaviour_group[3:4]['user_id'].values[0]\n",
"UV = base_count[0:1].values[0]\n",
"PV/UV # 计算访问量与点击量次数比"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "6c4b9afb",
"metadata": {
"pycharm": {
"name": "#%%\n"
},
"scrolled": false
},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA/gAAAJQCAYAAAA648+gAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACPTUlEQVR4nOzdd2AUZf7H8c/sbiohlRqIEIoISFEIIlKkeHIWLNgQTlGBQ/RAUfSQZuFHE2knVjjwUETFggVEUeEUKYJIC3CUgHQCCQkQ0nbn98cmSzaNBJJssnm/7tbszvPMzHd2SPnMMzNrmKZpCgAAAAAAVGgWTxcAAAAAAAAuHwEfAAAAAAAvQMAHAAAAAMALEPABAAAAAPACBHwAAAAAALwAAR8AAAAAAC9AwAcAAAAAwAsQ8AEAAAAA8AIEfAAAKiC73S7TNC9p3sTERKWmppZwReXH/v37PV0CAAAeQcAHAKCY3nrrLb322mvFDtipqamy2+2F9jFNU2lpaRcN4J9++qmuvPJKHT9+vFg1SNJLL72kjh076vz585Kk9PR0ZWRk5OmXnp6ulJSUfJeRczvOnTun+vXr68033yxWHTt27Mh3+qZNmwqd7/fff9d9992n9PT0PG1btmxRo0aN9OOPPxa6DNM09cQTT7jVnJKSorS0NDkcjiJUn79z586pQ4cO+u9//3vRvpmZma73ce7cua7tPnv2rCTpzJkz+vHHHy/6bwYAgGwEfAAAiiE+Pl4vvPCCfv31Vx04cED79+/P97F3715t377dbd7bbrtNNptNhmEU+LBYLPL399fgwYMLrePDDz/UNddco5o1axar/tjYWL3xxht69tlnFRAQIEmaMGGCfH1989Ti5+enbt265VnGypUr1aJFC9dIeVpamg4cOKCgoKAi13Hu3Dk1b95c//nPf9ymZ2Rk6IYbbtDzzz9f4Lw1atTQjz/+qJEjR+Zpa9mypW655RYNHz680AMwhmHIbrfrqaee0urVqyVJjz76qPz9/WW1WvPdN//6179c8+/atUuTJk3Ks9wffvhB69evV1RU1EXfg+eee861n6dOnap169Zp9+7dioqKUlJSkjZs2KDu3btr165dF10WAACSZJiXen4fAACVjGmauv3227V8+XJVqVKl0L52u12pqaluI+NHjhyRaZqyWq2uaQMHDlRgYKBmzpzpWkdaWpoCAgIKDO9bt25V69atCxxptlqtWr9+va699lq36SkpKbr++uvVoEEDff75567pCQkJOnv2rHx8fGQYhlsdVqs1T1iNj49Xly5dlJqaql9++UWGYSgyMlI//fSTbrzxxkLfl2wbNmxQ+/btdeLECYWHh7um//jjj+rRo4d27dqlxo0bFzj/559/rt69e2vlypVq06aN/Pz8ZLPZJEl//PGHZs2apRkzZig4OFiS88CB3W6Xv7+/axnnz5/XVVddpeuvv16LFi3SsWPHlJ6eLpvNJovFfQykbt26mj17tv7+979LkhYsWKCHHnpIH374oR544AFXv/vvv1+GYWjRokUXfQ9OnDihVq1aadGiRfrHP/6hp556Shs2bJDD4dBbb72lV199VfPmzVNsbGwR3lEAACSbpwsAAKCiGDFihJYvX65ly5apR48exZ4/MjIyzzQ/Pz8FBASoVq1aRVqGaZp6+umndc0112jv3r3atGmTQkNDJUnJycnq2LGjHnjggTzhXpIGDBigQ4cOafny5W7Tw8PD3UL2xVSvXl3ff/+9YmJiNHbsWD311FOSVKSzCVJSUnTkyBH9/PPPat68uRISEpSQkKDQ0FBVq1ZN7733niIjI/XDDz/ohx9+cM130003qWHDhq7Xd911l6ZPn66WLVuqe/fuWrduXZ51zZs3z+31M888o6lTp7peBwQE6P3339c111wjSRfdB76+vq7nf/vb3/Trr79q4MCBuvbaa3XllVcqPj5eX3zxhdLT0/XRRx/lmb9JkybauXOnJOfp9x988IEGDx6sHTt26PTp01q1apWqV68uHx8f/frrr1qxYoW6dOmi06dPu5aRmppa5H8rAIDKh4APAMBFmKapYcOG6fXXX9f8+fM1e/Zs3XTTTYXO889//lMTJ04s8VqmTp2qLVu2aMeOHerfv79efvll/fvf/5bD4dBjjz2m0NBQvfTSS3nmO3r0qL766it98sknCg4OVpcuXTRu3Lh8T8EvTHp6ukzTVGRkpL777js1bNhQa9eulSTVrl37ovP/8ssvuvnmm12vs0fpn3vuOY0aNUqLFy9W3bp1NWfOHFefLVu2aNGiRW4BX5KGDRsmSfrkk0+UmZkpX19f1atXTx9//LE6dOjg6pd9NkL2JQSpqalq1KiRJkyYoIceeqjI2559dkO26dOna9WqVerXr5/WrVun1157TdHR0fr222/zzNupUyfdeeedrtepqan65ptvFBAQoF9//VUJCQmKjY3VFVdcoczMTF1xxRX6+eef9d133+mtt95yzWe1WpWZmVnkmgEAlQsBHwCAizAMQ1dddZXee+89/e1vf9P8+fM1d+5c3X333Xrsscd0991369Zbb3X1f/DBB/OMiMfFxclisbidni85TxO3WCw6dOiQ23SHw6G0tDS309RXrFihkSNHavHixYqIiNCMGTN07bXXqmPHjlqzZo3Wrl2rX3/91XVtfU61a9fW1q1bVb9+fY0ZM0Zr1qyR1WrV0aNHdezYMfn5+eU5LV26cGp769atJUlTpkzRmDFj8n2fwsLC8p3ep08fLVy4UJLzjAVJbtfH9+jRQ35+fnr99dcVFBSkTZs2KTAw0K327NHzL774QrNnz5afn5/8/f21ePHiPJcQhIeHFzrK7evrq8OHD7uCcnx8vM6dO5enX2GXSUiSv7+/5s6dq+TkZB06dEivv/663n33XdWvX1+ffPKJGjRooDZt2mj37t06dOiQbrvtNte81atX14oVKzR37lz9+uuvqlOnjm666SalpKRo+vTpev/992W325WSkqKAgAB9+OGHevHFF7Vly5YC6wEAgIAPAEARDBkyxPU8KSlJkZGRCg0NVdOmTdW/f3/95z//UZ8+fSQ5R7lzB/y//vWvhd4s7ZNPPskzLfdo7Y033qhFixa5RoIbNmyoadOm6bHHHlNAQIB+//131atXr8B11K9fX/v27dPUqVP1j3/8Q126dNHkyZM1btw4BQQEuA4+pKSkKCMjQyEhIUpLS1OVKlV07NgxSc7T/O+44w75+vq6+o8bN047duzQxx9/nGedPXv2VJ06ddy2KT/x8fF6//339dprr7mFe8l5kCHngYEzZ85o3759OnXqVJ7lmKapw4cPa8+ePW7TIyMjXcvNPtCSfUBj6NCh+V4zf/PNN+c7Gp/T9ddfL8n576NVq1au6/H/7//+T9dcc43mzZunr7/+WjVq1HD1lZwfVfjCCy/o008/1fLlyzVgwABVq1ZN77//vp566ilt3bpV6enpOn/+vAICAnT06FHVqFHD9T4AAJAf7qIPAEARpKWluY34Zl/3Pn78eA0bNsztRmjnzp1zhePsj7tbvXq1Tp8+rTNnzrgeR44cka+vr/z9/RUXF+fWlpSUpCNHjrjVYLPZdM8990hy3hhvwoQJeuaZZ9S5c2dVrVpVDzzwgN577z0lJyfnuw2ZmZnq16+fUlNTNWDAAEnS888/r9TUVCUmJurkyZM6efKkhg4dquuuu04nT57UmTNnXOFecl6n3qJFCzVp0kSNGjVSo0aNtG3bNt14442u1zkfiYmJ+Y6mnz171vWw2+2qWbOmlixZoubNm+v22293u4Fgenq66+Z4d911l9auXauRI0e6XRMvOQ9MOBwO9evXT40bN3Z7bN68ucB96+fnp4cfflimaboeDz/8cJ7l58c0TR09elSzZ8/W559/7jqNPzMz03VgIzuw5zy4ERcXp0OHDmn9+vWqWrWqWrVqpWuvvVZffvml6tev77oeP/uj844eParo6OiL1gMAqNwI+AAAFMEzzzzjusv8wYMHdf3117s+Pu21117T+PHjXa/Xrl2re++9V/7+/q4b0EVERCgkJER
"text/plain": [
"<Figure size 1200x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 确保您的数据集 'data' 包含 'date_column' 列,并且该列是日期时间格式\n",
"data['date_column'] = pd.to_datetime(data['date'])\n",
"\n",
"# 使用 groupby 方法按 'behavior_type' 分组\n",
"count_by_behav = data.groupby('behavior_type')\n",
"\n",
"plt.figure(figsize=(12,6))\n",
"\n",
"# 遍历每个行为类型的组\n",
"for group_name, group_data in count_by_behav:\n",
" # 设置日期时间列为索引\n",
" group_data = group_data.set_index('date_column')\n",
" \n",
" # 使用 resample 方法按天计数\n",
" count_by_day = group_data.resample('D').count()['behavior_type']\n",
" x = count_by_day.index\n",
" y = count_by_day.values\n",
" \n",
" # 绘制图表,使用日期索引而不是 range(len(x))\n",
" plt.plot(x, y, label=group_name)\n",
"\n",
"# 设置 x 轴刻度标签为日期,并旋转 45 度\n",
"plt.xticks(rotation=45)\n",
"plt.legend(loc='best')\n",
"plt.xlabel('日期', fontsize=12)\n",
"plt.ylabel('行为次数', fontsize=12)\n",
"plt.title('每天各行为的访问次数')\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "083d0a20",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAqcAAAIrCAYAAADSlzCnAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABs+UlEQVR4nO3dd3gU5f7+8XuzKSSBhN4SmgEp9gICR8ROEAVBLCCoKKCC0iQoSlURBUGCiigqgoJ4RBQEpYoIygFBRZSiIELoNY2Qtju/P/jtfrNkW8ImGZL367r2Omdnnp35zLPB3JmZ5xmLYRiGAAAAABMIKukCAAAAAAfCKQAAAEyDcAoAAADTIJwCAADANAinAAAAMA3CKQAAAEyDcAoAAADTIJwCAADANAinQB42m03Lli3T8uXLS7qUQktLS9P//vc/bdiwoaRLgRvJycklXYIp2e12TZo0KWD9YxiG3n//ff39998B2Z4nGRkZ+vTTT5WWlua13VtvvaUjR46c9/68PTcnJydHNpvtvPdREDabTUlJScW6T5R+Fp4QBfyf9evX64YbblB0dLR27typKlWqlHRJBbZ582Zde+21ioyMVHp6epHtp3///po2bZreeust9e/fv9DbSU9P1/79+wv9+djYWJUvX75An0lJSdFvv/2mpUuXavHixVq3bp2io6MLXUNB3HLLLUpLS9O7776rq666qlj2WRAl8X1I0sMPP6zZs2fr9ttv1zfffCOr1VroGiRp1KhReumll3T11Vdr/fr1Cg0NlXT2+Ox2u8qVKyer1Xre+9m0aZOaN2+uqKgoHThwwO2xJycnq1KlSrJarVq5cqVuvPHGAu/Hbrdr7Nix2rJli7788ktZLJZ8bebPn69Ro0Zp5MiRuvfeexUcHFyYQ3LLMAwdOXJEe/bs0T///KMtW7Zow4YN2rx5s7Kzs/XTTz/p2muvDdj+ULYF7icXKAVatWqlfv36aerUqRo7dqymTp1a6G1t375dzZo1K/DnDh8+rBo1ahR6v5GRkZKk8PDwQm+jIPspTBDJa82aNbrzzjsL/flvvvlG7du3d7vO8Qv02LFj2rdvn/7991/99ddf+vfff13azZ49W08//XSh9j9kyBDZ7XaP65s1a6a+fftKks6cOaOffvpJwcHBatCgQYH2M3HiRJ05c0YhISFug0lBhISEaMiQIW63U5TfhzejR4/W4sWLtXz5ciUkJGjy5MmFrkGSBg0apFmzZumXX37R8OHDNWnSJOfyDz74wO/t3H///Zo3b57H9du3b5cktW/f3uO/hR9++EGSdNFFF6lNmzZ+7zuvEydO6J133tGxY8c0ceJEDRs2LF+bzz77TNu3b9err76qzp07+wynNptNr7/+unJzc2Wz2ZSdna3MzExlZGQoLS1NqampOn78uI4fP66kpCSdOXMm3zZCQ0MVHR2tgQMH6ptvvim2P/JQuhFOUWbs27dPV199tcqXL6+QkBCPZ0wcZxunT5/u8fJ+VlaW8zVt2jTdf//9+dpUq1ZN0tmQWLduXb/qO3PmjCIiIvw9JLccv5C8nREyDEO5ubk6c+aMQkNDVa5cuQLvx1Hn+Z55coToZ555Rq+//rrfnxs6dKgmTZrktb/++usvPf744873FotFhmGoRo0a6tGjhxo1aqT69esrLi6u0PW/8847yszM9Li+c+fOznC6fPlyZWZm6oknnlDFihULtJ93331Xu3fvLnSded1xxx165pln3K4ryu/Dm4suukhz5szRHXfcoTfeeENXX321evToUahtSVLlypX10Ucf6eabb9Ybb7yh22+/Xe3atVNMTIyuvvpqhYeHKzg4WFar1W1IP3TokLZt2+Y84+rJtm3bJEldu3b12GbFihWSpAEDBhT630u1atX0/vvvq1OnTho5cqTuuOMOXXrppc71p06d0uLFi2WxWDRjxgy//k1brVatWrXKWZ9DUFCQwsPDFRQUpLS0NLVs2VK33367atWqpZiYGMXGxio2NlY1atQo8M8x4A/CKcoMwzCUmpqq3Nxcl19KhmEoJyfH5ZeQ48ylu/vfDMNQdna285Wbm+t2f44ziy1atND333/vs74bb7xRa9as8fnL0F9Hjhzx6wzbzJkz9cgjj+RbfujQIdlsNkVHR6t8+fIFPltns9mUmZkpi8XiNbA4folOmjTJeXarILydIb7jjjs0depU1atXTw0aNFBcXJwiIyPVpEkTt8Fr7Nix+vTTTz1uLyoqShs3bnRZFhISopiYGO3atctleWZmpsLDwxUSEuJc9tlnn0mSHnvsMb+OLa/NmzerXLlyzp/dgjIMQ40aNdLu3bu9hr6i/D58iY+P19NPP601a9bokksuKfR2HG666SY98MADys7OVvXq1SWd/Y7Hjh3r87MfffSRevXqpbCwMK/ttm3bppCQELVr187tesMw9OWXXyooKEgNGjTQunXr8rWJiIjQ1Vdf7bOmjh076q677tLXX3+tV155RXPnznWumz17tjIzM3XfffepRYsWPrfl8P777+vEiROqWLGioqKiFBkZqXLlyslms+n222/Xd999p7/++kvvvfeeLrvsMr+3C5wPwinKjHr16ik7O9tl2d69e/Xggw/KMAx9++23ioqKyve5Tp06KSQkRI8//rhuu+02v/fn65eaJ0FBgRmnGBoa6vESos1m05kzZ3TmzBmPZz5Gjhzp1+XPnj17qmfPnh7XP/vss3r11Vc9rndcEu/Vq5fbS5WeTJgwQTNnzvQ6QCQyMrJAl+tTU1O1c+dOj+tr1qyZb1ne8OmO40z2qVOntHDhQklS8+bNvX5my5Ytuvzyy12W5b1cmp2draeeekrPPvtsvrO+GzduVOfOndWkSRN98sknqlWrlqSzl5Z3796tChUq6O677/a476L8PvzxyiuvKCQkpNB/pD3xxBP6448/NHToUN19992aO3fued0G4a6OESNGaP78+ZLO/jdEyv+dfvHFF7rkkkv0448/6sCBA5Lk8XaJK664Qr/99lu+5Y77Yx1neS0WiyZNmqTWrVtr6NChznaGYWjatGkKCgpyBm/H1ZHMzEzZ7XaPl9vr1q2b78qO3W5X//799d1336levXr68MMPXYLpsWPHNGzYMD3wwAMeQzlwPginKNNiYmJ05ZVX6u2339btt9+ulStXutw3tnv3bn399deqUKFCgc8iebsP0ZvzvZ/QoVKlSlq5cmWhP1++fHnVqVNHoaGhbu9dO378uE6cOKFatWq5DfW5ubnKyspynkH2JCcnp9A1SmdvsQgUx1nD1atX5xu0YrFY3P7Bkff7evnll3X48GE98sgjzkuujrOcH3zwgTIyMlSzZk2PQcFxa4ev+3i/+OILzZgxQ5988olefPFFDR482LmfnJwcHTx4UGfOnHEJ0zNnzpQk3XvvvV7Pbhbl9zF37lw9/fTTCgsLU7ly5Qr1h1i1atW0fv16j+sPHjyoH3/80Xl2+nz/Pbn7fHJycr4/Yjz9UTN9+nRJ0g033OC81SevL774wuMl+FdffVXjxo1zu2748OFulzdt2jTfMl/3zeZ16tQp9erVSwsXLlSPHj00bdo0VahQwbl+6dKlevTRR3Xo0CHNmjVLr732mhISEvzaNuAvwinKtODgYL311lvKysrSkSNHdOzYMZdg8MEHH8gwDI0dO1b16tUr0LbPPUvrL5vNFtBRtoU1ZcoUTZkyxeP6MWPGaOzYsZowYcJ53RuYkZEh6Wx4cgSoggjkjAS++t3d+rwB66efftK3336rdu3aOcNpUFCQsrOz9fbbbysiIkKbN29W7dq13W6/TZs2Wrdunc+zht26dVPFihX12GOPKSEhQQsWLNDs2bPVsGFDZ7i88sorXYJVZmamKlasqEcffdTrtovy+8jJydHp06edZ/X8DadHjx51npH19Uef4xaSvAE8MTFRgwYN8rmfxMREDRgwwGc7xx8pn3/+eb57TW+99VatWrVKYWFh2r9/v+bPn69q1app2bJl+UJobm6uQkJCPJ59j4iIUO3atRUeHu723tjDhw8rJSVFderUyXfrjGEYzkF
"text/plain": [
"<Figure size 800x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"pv_item = data[data['behavior_type'] == 'pv'].groupby('item_id')['user_id'].count().sort_values(ascending=False)\n",
"buy_item = data[data['behavior_type'] == 'buy'].groupby('item_id')['user_id'].count().sort_values(ascending=False)\n",
"merge2 = pd.merge(pv_item, buy_item, on='item_id', how='outer').fillna(0)\n",
"x = merge2['user_id_x']\n",
"y = merge2['user_id_y']\n",
"plt.figure(figsize=(8, 6))\n",
"plt.scatter(x, y, marker='o', color='g')\n",
"plt.xlabel(\"商品点击量\", fontsize=14)\n",
"plt.ylabel(\"商品购买量\", fontsize=14)\n",
"plt.title(\"商品点击量和购买量之间的关系\", fontsize=18)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "0531ec36",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0cAAAIbCAYAAAAggeW7AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACkVklEQVR4nOzde3zO5R/H8de98xw25hSSQ6SUUyGURIpyPlSkk1JJolBSISpKUlRSiH5yKuYUIoWklUMMDUkjOR83M5vZvr8/ru57u21ms3v77vB+Ph73Y9f3cH/vzz3bfD/3dV2fy2FZloWIiIiIiEgB52V3ACIiIiIiIrmBkiMRERERERGUHImIiIiIiABKjkRERERERAAlRyIiIiIiIoCSIxEREREREUDJkYiIiIiICKDkSEREREREBAAfuwMQEQE4deoU3t7eBAUF2R2K2ODChQv4+OTN/5IOHDjA/v37AahatSolS5bM1PNPnjxJSEhIdoQmIiKZpJ4jEbHVwYMH6d27NxUrVmTMmDF2h1PgnTx5ku+//56wsDA2b96cI68ZHx9PlSpVGDhwINu2bcuR1/Skr776ikaNGtGoUSNWrFiR4ecdP36chx56iIoVKxIREZGNEWZOYmIis2bNYtKkSXaHkueFh4dny79tQkIC33//PT169GDq1Kkev75IQeawLMuyOwgR8bzY2FjOnz+fLdf29fWlcOHCHrlWTEwMlSpV4sSJExQrVox9+/blu96jkydPYlkWhQoVwtfXF29vbxwOxxVd6/z588THx7seZcqU8WiPy6ZNm6hXrx4AderUyZEE6euvv+bBBx8E4Pbbb2f16tV4e3tn++t6yocffsiLL74IwNy5c+ncuXOGnvfGG28wfPhwAG644QbWr19PkSJFXMePHDnC2bNnCQgIwN/fH29v70z97Hh5eVGoUKEMnbt//342btzI8uXLWbJkCf/++y8hISH8/fffBAcHu8678cYbL3uz37t3bz755JMMvS7ApEmTmDdvHlOnTqVs2bIZft6lxMXFsX79epYtW8apU6eYOHFiuue/++67vPLKK5e9bps2bVi8eHG658THx/P777/z448/smjRItavX891113Hb7/9RrFixTLzNtK1c+dObrzxRpKSkqhSpQq7d+/Gy0ufd4t4hCUi+dKTTz5pAdnyaNWqlUdjHTlypOva7733nkevnRs8+OCD2fZvsWPHDo/GumPHDte1n3nmGY9e+1Luu+8+C7B8fX09/n5ywoQJE1zfs8WLF2f4eefPn7fq1avnem737t3djvfr1y9LPxvVq1dP9/UfeeQRq3bt2lZwcPAlrzFkyBC359SuXfuyr/vKK69k+Hvw9ddfW15eXhZglShRwpo7d26Gn5uQkGDt3r3bWrJkifX+++9bPXr0sG6++WbL19fXLZ6pU6eme52PPvooQ9/PBx54wLIsy0pMTLSOHDlibdq0yZo7d641atQo65FHHrHq1KmT6rWdj9atW1tJSUkZfm8Z0aZNG9f1V6xY4dFrixRkeXOAt4hcVkBAQLZd29/f36PXe+aZZxg+fDjx8fF8/PHH9O/fX5+C2sTPz8/Vrly5cra/3t9//+0aivbss89y/fXXZ8vrxMbG8s8//2To3FKlSlGiRIkMXzvl98zX1zfDz/P19WXWrFnUrl2b2NhYZsyYQdOmTXnqqafSPN/Lywtvb+/L/m7Ex8cDl/893bZtG+Hh4a7ta665hrp163LzzTdTv3596tevn2r+lPO9Pvzww3z00Udux2rVqsX+/fsJDAxM93VTatiwIc2aNeOHH37gxIkTdOnShf79+zN69Gi8vb1ZsWIFERERxMbGcurUKQ4fPsyBAwf4559/2LdvHxcuXLjktUuUKEHVqlVZtWoVDzzwwCV70Zz7K1euzO+//57qeO/evZk1axaBgYFs376dunXrpvu611xzDTfeeCM1a9akTp061K1bl+rVq1+2xy86OpqDBw+me05Kt99+O99++y0An3/+ORUqVEh1TlJSkqu3OS4uDj8/Pxo1apTh1xApiJQcieRTzps0b2/vdP8j79mzJ1OmTAHAuswoW+d/7p5OjkJCQmjfvj1ff/01QUFBREZGcu2113r0Nex03333UblyZQoVKoSPj88lb3B/+OEHvvvuOwBeffVVihcv7nbcsqxUw+ouPierUg7Ry4kiAS+++KLr53P8+PGMHz8+U8/38fHh119/5ZZbbkn3vI0bN9K0adMMXXPUqFEZGmbllPKmN7NJfdWqVRkwYABvvvkmAG+++Sbdu3enUKFCjBgxgqFDhxIYGJiphKNSpUrs27fPLWlLi3NobK1atVi9enWGfpYSEhIAKFKkSKphYs73npkEsUKFCnz//fe8++67vP766yQmJjJ27FjCw8OZM2cOpUqVcg1ZTM+tt95Kx44dqVKlClWqVKFq1apuwwHT4/yZ9/LySnPom/P76OPjw0033cSdd97JypUr8fPz4/rrr6dGjRpER0ezdOlS/P392bdvX4bff0rLli2ja9euV/TcuXPnMnfu3Mued8stt7Bx48Yreg2RgkLJkUg+lZkblMy63E3XlRg2bBgjRoygevXqHr+23R599NEMnRcXF+dKjp566ikqVarkkdfftWvXZRNfp8OHD7vaJ0+eZOfOnRl+nWLFinHVVVdl+PwVK1awaNGiDJ+flpdeeumyiRFk7vchsz/fKROiK+nxHDRoEF988QX33Xcf77zzjqsnI6tz7y73IYbzexIYGJjhJNs5jzG92DI7n87hcPDKK69Qv359unTpwunTp9myZQuRkZHUq1ePWrVqYVkW1apVo1q1alSsWJFrrrmGihUrUqdOHRITE2nVqhWDBg3K1Os6ZXTOnnMe3JgxY3A4HNxwww2u7+G0adNYunTpFb2+U8p/r6uuusojH0IlJiZy4cIF14cpeWkun4hdlByJ5FPZ+Z9gRq8dFhZG48aNsy2ON998k9dffz3brp9f1KxZ0/WJf2a88sormepB6dOnT6qhVpcSGxtL3759AXOjHR0dDUBoaCjNmjW75PP27t1L8+bNOXXqFLfccgvDhg3L0OulvAFO6+dm9erVrtf19fUlPj6eI0eO4O/vj5+fHz4+Pm69fs7fAYfD4fb74HA4SEhIID4+npiYGAoXLkzRokVdx8PCwli3bh0ATZo04dZbb6Vw4cL88ccfGe7pyKjLJXlXUsjj7NmzAJQuXfqKYkrPXXfdxc8//0zbtm2ZN28edevWBXAb+pcdMvrBgfO82rVrZ0scKRP42bNnp+rpjI+PZ8OGDQCULVv2kr3rv/zyC3/++ScADz74YKZ6HUVEpbxF8q3sTI4y+ul4dvZeQfbOq8pPMlqxLKsyM0/nueeeY9euXQDMmDGDhx9+GDDD7JzDmy5+ADz00EOcOnWKkJAQ5s6dm+FP1zOTCPj4+LB582YqVqzIVVddRUhICEFBQRQqVAh/f398fX3x8vLCy8sLh8Ph1jN411134efnR9GiRSlbtixffvml27U3bdrESy+9xEsvvcTPP//s2u/pxAgu/3ua2b8R8fHx/PvvvwDZNuz1xhtvZOfOna7EKCc452hdTnZV/3RK+TOaVu9bTEwMTZo0oUmTJnz66aeXvM7s2bPp0aMHPXr0uKIPRUQKOvUcieRT2VnQIKPXTvnJ9ciRI+nYsWOWX/vjjz92lQn29Nyn/CowMJCoqCgqVqzI3r170z13yZIltGnTBoDffvuNBg0apHv+tGnT6NGjB0CGh2aNGzeOadOmAdCvXz/atGlDgwYNWLZsGfv27ePJJ5/km2++cXvO+fPn6dSpEzt27MDX15fZs2dnathhZhL17FyMNuWww4s/0U9ISHD1zGSUt7e3W8/UlYiLi2PHjh0kJSURHx9PbGws0dHRHDt2jLZt27rmnn377bckJiYCpDuUMS4uLkvxZMew3fTkluQoZUKUVnKU8t85vaUUUpaDT9kWkYxRciSSTzn/c01MTEx3fY1z58652p5chwPcb0jLli3rkUpkKatn5fRNVF6VmcTg6NGjrnZm17LKyPCdOXPm0L9/fwD
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"pv_time = data[data['behavior_type'] == 'pv'].groupby('hour')['user_id'].count()\n",
"pv_time = pv_time.reset_index().rename(columns={'user_id': 'pv'})\n",
"uv_time = data.groupby('hour')['user_id'].apply(lambda x: x.drop_duplicates().count())\n",
"uv_time = uv_time.reset_index().rename(columns={'user_id': 'uv'})\n",
"x = pv_time['hour']\n",
"y1 = pv_time['pv']\n",
"y2 = uv_time['uv']\n",
"plt.figure(figsize=(10, 6))\n",
"plt.subplot(1, 1, 1)\n",
"plt.plot(x, y1, label='访问量', color='r', linewidth=1.8, marker='o', markersize=4)\n",
"plt.bar(x, y2, label='用户量')\n",
"plt.legend(loc='best')\n",
"plt.title(\"用户一天内各时间段活跃度\", fontsize=24)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "c38b5577",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>1</th>\n",
" <th>2268318</th>\n",
" <th>2520377</th>\n",
" <th>pv</th>\n",
" <th>1511544070</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2333346</td>\n",
" <td>2520771</td>\n",
" <td>pv</td>\n",
" <td>1511561733</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2576651</td>\n",
" <td>149192</td>\n",
" <td>pv</td>\n",
" <td>1511572885</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>3830808</td>\n",
" <td>4181361</td>\n",
" <td>pv</td>\n",
" <td>1511593493</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>4365585</td>\n",
" <td>2520377</td>\n",
" <td>pv</td>\n",
" <td>1511596146</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>4606018</td>\n",
" <td>2735466</td>\n",
" <td>pv</td>\n",
" <td>1511616481</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100150801</th>\n",
" <td>999999</td>\n",
" <td>4797808</td>\n",
" <td>11120</td>\n",
" <td>pv</td>\n",
" <td>1512293403</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100150802</th>\n",
" <td>999999</td>\n",
" <td>4613472</td>\n",
" <td>4602841</td>\n",
" <td>pv</td>\n",
" <td>1512293766</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100150803</th>\n",
" <td>999999</td>\n",
" <td>3647364</td>\n",
" <td>2304296</td>\n",
" <td>pv</td>\n",
" <td>1512293792</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100150804</th>\n",
" <td>999999</td>\n",
" <td>1903801</td>\n",
" <td>2304296</td>\n",
" <td>pv</td>\n",
" <td>1512293827</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100150805</th>\n",
" <td>999999</td>\n",
" <td>3696094</td>\n",
" <td>4602841</td>\n",
" <td>pv</td>\n",
" <td>1512293891</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100150806 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" 1 2268318 2520377 pv 1511544070\n",
"0 1 2333346 2520771 pv 1511561733\n",
"1 1 2576651 149192 pv 1511572885\n",
"2 1 3830808 4181361 pv 1511593493\n",
"3 1 4365585 2520377 pv 1511596146\n",
"4 1 4606018 2735466 pv 1511616481\n",
"... ... ... ... .. ...\n",
"100150801 999999 4797808 11120 pv 1512293403\n",
"100150802 999999 4613472 4602841 pv 1512293766\n",
"100150803 999999 3647364 2304296 pv 1512293792\n",
"100150804 999999 1903801 2304296 pv 1512293827\n",
"100150805 999999 3696094 4602841 pv 1512293891\n",
"\n",
"[100150806 rows x 5 columns]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd \n",
" \n",
"data = pd.read_csv('UserBehavior.csv')\n",
"# data.head(10)\n",
"# data.info()\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b178853a",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 20,
"id": "48d151ba",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['1', '2268318', '2520377', 'pv', '1511544070'], dtype='object')"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.rename(columns={'收货地址 ': '收货地址', '订单付款时间 ':'订单付款时间'}, inplace=True)\n",
"data.columns"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "770760ae",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"ename": "KeyError",
"evalue": "'订单付款时间'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
"File \u001b[1;32mc:\\Users\\Lenovo\\PycharmProjects\\pythonProject\\.venv\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
"File \u001b[1;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mpandas\\\\_libs\\\\hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mpandas\\\\_libs\\\\hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n",
"\u001b[1;31mKeyError\u001b[0m: '订单付款时间'",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[21], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28msum\u001b[39m(\u001b[43mdata\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m订单付款时间\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39misnull())\n",
"File \u001b[1;32mc:\\Users\\Lenovo\\PycharmProjects\\pythonProject\\.venv\\Lib\\site-packages\\pandas\\core\\frame.py:4102\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 4100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[0;32m 4101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[1;32m-> 4102\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[0;32m 4104\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n",
"File \u001b[1;32mc:\\Users\\Lenovo\\PycharmProjects\\pythonProject\\.venv\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[0;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[0;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[0;32m 3810\u001b[0m ):\n\u001b[0;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[1;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[0;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[0;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[0;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[0;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[0;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n",
"\u001b[1;31mKeyError\u001b[0m: '订单付款时间'"
]
}
],
"source": [
"sum(data['订单付款时间'].isnull())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d21acf79",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"print(data[data['订单付款时间'].isnull() & data['买家实际支付金额']>0].size) # 查看缺失值是否为拍下订单但是未付款情况\n",
"print(sum(data['订单付款时间'].isnull()) / data.shape[0]) # 查看缺失值与整体数据的比例"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d79e2277",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"data.duplicated().sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2551f804",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"data.describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "93fce0a0",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"plt.boxplot(data['总金额'])\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a18faf8",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>订单编号</th>\n",
" <th>总金额</th>\n",
" <th>买家实际支付金额</th>\n",
" <th>收货地址</th>\n",
" <th>订单创建时间</th>\n",
" <th>订单付款时间</th>\n",
" <th>退款金额</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>19257</th>\n",
" <td>19258</td>\n",
" <td>188320.0</td>\n",
" <td>0.0</td>\n",
" <td>上海</td>\n",
" <td>2020-02-24 19:35:06</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 订单编号 总金额 买家实际支付金额 收货地址 订单创建时间 订单付款时间 退款金额\n",
"19257 19258 188320.0 0.0 上海 2020-02-24 19:35:06 NaN 0.0"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[data['总金额'] > 175000]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "81455de3",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"plt.boxplot(data['买家实际支付金额'])\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5594d619",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>订单编号</th>\n",
" <th>总金额</th>\n",
" <th>买家实际支付金额</th>\n",
" <th>收货地址</th>\n",
" <th>订单创建时间</th>\n",
" <th>订单付款时间</th>\n",
" <th>退款金额</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3143</th>\n",
" <td>3144</td>\n",
" <td>11400.0</td>\n",
" <td>11400.0</td>\n",
" <td>江苏省</td>\n",
" <td>2020-02-18 09:34:43</td>\n",
" <td>2020-02-18 09:34:53</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13511</th>\n",
" <td>13512</td>\n",
" <td>16065.0</td>\n",
" <td>16065.0</td>\n",
" <td>内蒙古自治区</td>\n",
" <td>2020-02-26 15:41:27</td>\n",
" <td>2020-02-26 15:42:24</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 订单编号 总金额 买家实际支付金额 收货地址 订单创建时间 \\\n",
"3143 3144 11400.0 11400.0 江苏省 2020-02-18 09:34:43 \n",
"13511 13512 16065.0 16065.0 内蒙古自治区 2020-02-26 15:41:27 \n",
"\n",
" 订单付款时间 退款金额 \n",
"3143 2020-02-18 09:34:53 0.0 \n",
"13511 2020-02-26 15:42:24 0.0 "
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[data['买家实际支付金额'] > 6000]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9386ff31",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"plt.boxplot(data['退款金额'])\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "47144d28",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"data[data['退款金额'] > 2000]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a7040399",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import numpy as np\n",
"np.sum(data['买家实际支付金额'])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fb408ef4",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"data_area = data.groupby('收货地址').sum()['买家实际支付金额'].sort_values(ascending=False).reset_index()\n",
"\n",
"plt.figure(figsize=(20,8))\n",
"plt.bar(data_area['收货地址'], data_area['买家实际支付金额'],width=0.2)\n",
"plt.xlabel('')\n",
"plt.ylabel('销售额', rotation=0, labelpad=30, fontsize=15)\n",
"plt.title('各省市销售额情况', fontsize=20)\n",
"plt.xticks(rotation = 45)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f2a83226",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"plt.figure(figsize=(20,8),dpi=100)\n",
"plt.hist(data[data['总金额'] < 500]['总金额'])\n",
"plt.xticks(np.arange(0,500,step=25), fontsize=20)\n",
"plt.yticks(fontsize=20)\n",
"plt.xlabel('订单金额',fontsize=20)\n",
"plt.ylabel('订单数',fontsize=20, rotation=0, labelpad=40)\n",
"plt.title('订单金额分布情况', fontsize=25)\n",
"plt.show()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}