You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
P2P/2021142108 林俊聪.py

91 lines
3.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import pandas as pd
import numpy as np
# Print the author's student id / name banner before the analysis output.
print("2021142108 林俊聪")
def dropnullstd(data):
    """Drop uninformative columns from *data* in place.

    Two kinds of columns are removed:
      1. columns whose non-null ``count`` is 0 (entirely null), and
      2. columns whose standard deviation is 0 (a single constant value).
    Both are detected from ``describe()``, which only summarises numeric
    columns, so non-numeric columns are never dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        The frame to clean; it is modified in place.

    Returns
    -------
    None
    """
    print("删除前的的形状为:", data.shape)
    # count == 0 flags all-null numeric columns.
    null_mask = data.describe().loc["count"] == 0
    # Vectorised drop replaces the original per-column loop, which used
    # positional integer indexing on a Series (deprecated in modern pandas).
    data.drop(columns=null_mask[null_mask].index, inplace=True)
    # Recompute on the remaining columns: std == 0 flags constant columns.
    zero_std_mask = data.describe().loc["std"] == 0
    data.drop(columns=zero_std_mask[zero_std_mask].index, inplace=True)
    print("删除后数据的形状为:", data.shape)
def transformdayintominute(data):
    """Convert every timedelta element of *data* to minutes, in place.

    Each element is replaced by ``element.total_seconds() / 60`` (a float).
    The mutated container is also returned for call-chaining convenience.
    """
    for position, delta in enumerate(data):
        data[position] = delta.total_seconds() / 60
    return data
def transformdayintohour(data):
    """Convert every timedelta element of *data* to hours, in place.

    Each element is replaced by ``element.total_seconds() / 3600`` (a float).
    The mutated container is returned as well.
    """
    index = 0
    count = len(data)
    while index < count:
        data[index] = data[index].total_seconds() / 3600
        index += 1
    return data
# 1. Read the P2P lending master table Training_Master and inspect basic info.
data = pd.read_csv("Training_Master.csv")
print("主表的维度为:", data.ndim)
print("主表的形状为:", data.shape)
print("主表的内存为:", data.memory_usage())
# Summary statistics via the describe method.
print("主表的描述性统计为:", data.describe())
# Drop columns that are entirely null or hold a single constant value.
dropnullstd(data)
# 2. Extract time information from the user-update table (Training_Userupdate)
#    and the login table (Training_LogInfo).
data2 = pd.read_csv("Training_Userupdate.csv")
data3 = pd.read_csv("Training_LogInfo.csv")
# Parse the timestamp strings of both tables with to_datetime.
data2["ListingInfo1"] = pd.to_datetime(data2["ListingInfo1"])
data2["UserupdateInfo2"] = pd.to_datetime(data2["UserupdateInfo2"])
data3["Listinginfo1"] = pd.to_datetime(data3["Listinginfo1"])
data3["LogInfo3"] = pd.to_datetime(data3["LogInfo3"])
# Pull year / month / week / day fields from the first five parsed timestamps.
year = [i.year for i in data2["ListingInfo1"].head()]
print("ListingInfo1中的前5个年份信息", year[:5])
month = [i.month for i in data2["ListingInfo1"].head()]
print("ListingInfo1中的前5个月份信息", month[:5])
week = [i.week for i in data2["ListingInfo1"].head()]
print("ListingInfo1中的前5个星期信息", week[:5])
day = [i.day for i in data2["ListingInfo1"].head()]
print("ListingInfo1中的前5个日期信息", day[:5])
# Difference between listing time and update time, shown in days, hours, minutes.
timeDeltaUserupdate=data2["ListingInfo1"] - data2["UserupdateInfo2"]
print("计算时间差以日期为单位:\n", timeDeltaUserupdate.head())
print("计算时间差以小时为单位:\n", transformdayintohour(timeDeltaUserupdate).head())
# NOTE(review): transformdayintohour mutated the Series in place above, so the
# delta must be recomputed before converting to minutes.
timeDeltaUserupdate=data2["ListingInfo1"] - data2["UserupdateInfo2"]
print("计算时间差以分钟为单位:\n", transformdayintominute(timeDeltaUserupdate).head())
# 3. Analyse the update and login tables further with group-by aggregation.
# Group both tables by the user id column Idx.
UserupdateGroup = data2[["Idx", "UserupdateInfo2"]].groupby(by="Idx")
LogInfoGroup = data3[["Idx", "LogInfo3"]].groupby(by="Idx")
print('分组后的用户信息更新表为:', UserupdateGroup.head())
print('分组后的登录信息表为:', LogInfoGroup.head())
# Earliest / latest update time per user via agg. The string aliases
# "min"/"max" replace np.min/np.max, which are deprecated as agg arguments
# in pandas 2.x and produce identical results.
print("分组后最早更新时间:", UserupdateGroup.agg("min").head())
print("分组后最晚更新时间:", UserupdateGroup.agg("max").head())
# Per-user counts of information updates and logins.
print("分组后信息更新次数:", UserupdateGroup.size().head())
print("分组后登录次数:", LogInfoGroup.size().head())