import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets

# Load scikit-learn's bundled iris dataset.  The rest of the script reads its
# ``data``, ``target`` and ``feature_names`` attributes.
iris = datasets.load_iris()

# Report the array shapes and the feature names before any analysis.
for _caption, _value in (
    ('iris.data的形状为:', iris.data.shape),
    ('iris.target的形状为:', iris.target.shape),
    ('iris.data的特征名称为:', iris.feature_names),
):
    print(_caption, _value)
# Visualise the feature matrix as a heatmap: first as-is, then in two sorted
# views.  sns.heatmap draws into the currently active Axes, so each plot gets
# its own figure and an explicit show(); otherwise the three heatmaps (and
# their colorbars) would be stacked on one Axes when run as a plain script.
plt.figure()
ax = sns.heatmap(iris.data)
plt.show()

# np.sort works along the last axis by default, i.e. within each row.
plt.figure()
ax = sns.heatmap(np.sort(iris.data))
plt.show()

# axis=0 sorts each column independently instead.
plt.figure()
ax = sns.heatmap(np.sort(iris.data, axis=0))
plt.show()
# Descriptive statistics computed directly on the NumPy feature matrix.
_features = iris.data
_column0 = _features[:, 0]

# Per-feature reductions (axis=0 collapses the rows).
print('iris.data各特征的和为:', _features.sum(axis=0))
print('iris.data各特征的均值为:', _features.mean(axis=0))
print('iris.data各特征的标准差为:', _features.std(axis=0))
print('iris.data各特征的方差为:', _features.var(axis=0))

# Extremes over the whole matrix.
print('iris.data的最大值为:', np.max(_features))
print('iris.data的最小值为:', np.min(_features))

# Positions of the extremes.  Without an axis argument, argmax/argmin return
# an index into the flattened array.
print('iris.data特征0的最大值索引为:', np.argmax(_column0))
print('iris.data特征0的最小值索引为:', np.argmin(_column0))
print('iris.data的最大值索引为:', np.argmax(_features))
print('iris.data的最小值索引为:', np.argmin(_features))

# Median and range (peak-to-peak) for feature 0 and for all values.
print('iris.data特征0的中位数为:', np.median(_column0))
print('iris.data的中位数为:', np.median(_features))
print('iris.data特征0的极差为:', np.ptp(_column0))
print('iris.data的极差为:', np.ptp(_features))

# 2x2 covariance matrix of feature 0 and feature 1.
print('iris.data特征0与特征1的协方差为:\n', np.cov(_column0, _features[:, 1]))
# Build one table holding the four measurements plus the class label.
# np.hstack returns a single homogeneous array, so the label column ends up
# with the same dtype as the measurements.
_label_column = iris.target.reshape(-1, 1)
_stacked = np.hstack((iris.data, _label_column))
df_iris = pd.DataFrame(_stacked, columns=['f0', 'f1', 'f2', 'f3', 'species'])

# Quick look at the resulting frame.
print('iris的形状为:', df_iris.shape)
print('iris的列名为:', df_iris.columns)
print('iris的前5行为:\n', df_iris.head())
# Basic summary statistics for feature 0 using the pandas Series API.
_f0_series = df_iris['f0']
print('iris特征0的最小值为:', _f0_series.min())
print('iris特征0的最大值为:', _f0_series.max())
print('iris特征0的均值为:', _f0_series.mean())
# The range is max - min.  Kurtosis (Series.kurt) is a different statistic
# and must not be reported under this label.
print('iris特征0的极差为:', _f0_series.max() - _f0_series.min())
print('iris特征0的方差为:', _f0_series.var())
print('iris特征0的标准差为:', _f0_series.std())
# Pairwise covariance and correlation between the four measurement columns.
_measurements = df_iris[['f0', 'f1', 'f2', 'f3']]
print('iris的协方差矩阵为:\n', _measurements.cov())
print('iris数据表的相关系数为:\n', _measurements.corr())

# Further single-column statistics for feature 0: standard error of the mean,
# mode(s), sample skewness, sample kurtosis and median.
_f0_series = df_iris['f0']
print('iris特征0的标准误差为:', _f0_series.sem())
print('iris特征0的众数为:\n', _f0_series.mode())
print('iris特征0的样本偏度为:', _f0_series.skew())
print('iris特征0的样本峰度为:', _f0_series.kurt())
print('iris特征0的中位数为:', _f0_series.median())
# First quartile, non-null count and mean absolute deviation of feature 0.
_f0_series = df_iris['f0']
print('iris特征0的四分位数为:', _f0_series.quantile(q=0.25))
print('iris特征0的非空值数目为:', _f0_series.count())
# Series.mad() was removed in pandas 2.0, so the mean absolute deviation
# around the mean is computed explicitly (same definition mad() used).
_mean_abs_dev = (_f0_series - _f0_series.mean()).abs().mean()
print('iris特征0的平均绝对离差为:', _mean_abs_dev)

# count / mean / std / min / quartiles / max for the four measurement columns.
print('iris数据表的描述性统计为:\n', df_iris[['f0', 'f1', 'f2', 'f3']].describe())
# Treat the class label as categorical rather than numeric from here on.
_species_as_category = df_iris['species'].astype('category')
df_iris['species'] = _species_as_category

# NOTE(review): on a frame mixing numeric and categorical columns, describe()
# reports only the numeric ones by default, so 'species' will not appear in
# this first summary -- confirm that is the intended demonstration.
_all_columns = df_iris[['f0', 'f1', 'f2', 'f3', 'species']]
print('iris数据表的描述性统计为:\n', _all_columns.describe())

# Describing the categorical column on its own gives its categorical summary.
_species = df_iris['species']
print('iris数据表category类别species的描述性统计为:\n', _species.describe())

# Frequency tables: how often each species and each feature-0 value occurs.
print('iris品种的频数统计为:\n', _species.value_counts())
print('iris特征0的频数统计为:\n', df_iris['f0'].value_counts())
# Equal-width discretisation of feature 0 into k intervals.
k = 5

# retbins=True makes pd.cut also return the computed bin edges.
_cut_result = pd.cut(df_iris['f0'], bins=k, retbins=True)
f0, bins = _cut_result
print('iris特征0分割成', k, '个等宽区间的分割点是:\n', bins)

# Number of samples that fall into each interval.
_interval_counts = f0.value_counts()
print('iris特征0等宽离散化为5个区间后每个区间及其频数为:\n', _interval_counts)
def _draw_equal_width_bars(counts, tick_labels, bin_count):
    """Show a bar chart of per-interval counts for the equal-width bins.

    Args:
        counts: bar heights, one per interval.
        tick_labels: x-axis labels, one per interval.
        bin_count: number of bars / tick positions.
    """
    positions = range(bin_count)
    plt.figure(figsize=(6, 4))
    plt.bar(positions, counts, width=0.5)
    plt.title('iris特征0等宽法频数统计图')
    plt.xlabel('iris特征0')
    plt.ylabel('频数')
    plt.xticks(positions, tick_labels, rotation=20)
    plt.show()


# Select a font with CJK glyphs so the Chinese titles and labels render, and
# set the default font size, before any figure is created.
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rc('font', size=14)

# First chart: intervals in value_counts() default order (by frequency).
a_frequency = f0.value_counts()
labels = a_frequency.index[0:k]
_draw_equal_width_bars(a_frequency, labels, k)

# Second chart: sort=False keeps the intervals in their category order.
a_frequency = f0.value_counts(sort=False)
labels = a_frequency.index[0:k]
_draw_equal_width_bars(a_frequency, labels, k)
def same_frequency_cut(data, k):
    """Discretise ``data`` into ``k`` equal-frequency (quantile-based) bins.

    Args:
        data: pandas Series of numeric values to discretise.
        k: number of bins; must be a positive integer.

    Returns:
        The result of ``pd.cut`` on ``data`` using the k + 1 quantile values
        as bin edges, with the first interval closed on the left.

    Raises:
        ValueError: propagated from ``pd.cut`` when the quantile edges are
            not unique (for example, heavily tied data).
    """
    # k + 1 evenly spaced probabilities 0, 1/k, ..., 1.  Dividing an integer
    # range guarantees exactly k + 1 points ending at exactly 1.0; a
    # float-step arange can overshoot 1.0 or yield an extra point.
    probabilities = np.arange(k + 1) / k
    edges = data.quantile(probabilities)
    # Bins are right-closed by default, so without include_lowest=True the
    # minimum value would fall outside every interval and become NaN.
    return pd.cut(data, edges, include_lowest=True)


# Per-bin counts, ordered by frequency.  The bin count comes from the same k
# that the plotting code uses.
a_frequency = same_frequency_cut(df_iris['f0'], k).value_counts()
print('iris特征0等频法离散化后各个类别数目分布状况为:', '\n', a_frequency)
# Bar chart of the equal-frequency bins, in category order (sort=False).
# The bin count is taken from k rather than a literal so that the number of
# bins always matches the bar positions, ticks and labels below.
a_frequency = same_frequency_cut(df_iris['f0'], k).value_counts(sort=False)
labels = a_frequency.index[0:k]

_positions = range(k)
plt.figure(figsize=(6, 4))
plt.bar(_positions, a_frequency, width=0.5)
plt.title('iris特征0等频法频数统计图')
plt.xlabel('iris特征0')
plt.ylabel('频数')
plt.xticks(_positions, labels, rotation=20)
plt.show()
# Group the table by species and print one summary per reduction.
irisGroup = df_iris.groupby(by='species')

# (caption, bound reduction) pairs.  Each reduction is evaluated lazily inside
# the loop, so results are computed and printed in the listed order.
_group_reports = (
    ('iris数据表按species分组后前5组每组的数量为:\n', irisGroup.count),
    ('iris数据表按species分组后前5组每组的最大值为:\n', irisGroup.max),
    ('iris数据表按species分组后前5组每组的最小值为:\n', irisGroup.min),
    ('iris数据表按species分组后前5组每组的和为:\n', irisGroup.sum),
    ('iris数据表按species分组后前5组每组的均值为:\n', irisGroup.mean),
    ('iris数据表按species分组后前5组每组的标准差为:\n', irisGroup.std),
    ('iris数据表按species分组后前5组每组的中位数为:\n', irisGroup.median),
    ('iris数据表按species分组后前5组每组的大小为:\n', irisGroup.size),
)
for _caption, _reduce in _group_reports:
    # head() limits the output to the first five groups.
    print(_caption, _reduce().head())
# Column-specific aggregation on the whole frame: sum of f0, and both the
# mean and the sum of f1.
_frame_spec = {'f0': np.sum, 'f1': [np.mean, np.sum]}
print('iris数据表的特征0总和与特征1的总和与均值为:\n', df_iris.agg(_frame_spec))

# One aggregation function applied to every column of every group.
_group_means = irisGroup.agg(np.mean)
print('iris数据表分组后前5组每组的均值为:\n', _group_means.head())

# Different sets of aggregations per column, evaluated per group.
_group_spec = {
    'f0': np.sum,
    'f1': [np.mean, np.sum],
    'f2': [np.mean, np.sum, np.std],
}
print('iris数据表分组后前5组每组的聚合结果为:\n', irisGroup.agg(_group_spec))
# Pivot table keyed by species.  No aggfunc is given, so pivot_table uses its
# default aggregation (the mean) for every remaining column.
_pivot_source = df_iris[['f0', 'f1', 'f2', 'f3', 'species']]
irisPivot = pd.pivot_table(_pivot_source, index='species')

# Inspect the group keys, how many groups there are, and the aggregated values.
_group_keys = irisPivot.index
print('iris数据表按species分组聚合后的组名为:\n', _group_keys)
print('iris数据表按species分组聚合后的组的数量为:', _group_keys.size)
print('iris数据表按species分组聚合后的各组平均值为:\n', irisPivot.values)
print('以species作为分组键创建的iris数据透视表为:\n', irisPivot.head())
# Pivot tables with two grouping keys, summing the remaining columns.
_pivot_source = df_iris[['f0', 'f1', 'f2', 'f3', 'species']]

# Row index (f0, species).
irisPivot = pd.pivot_table(
    _pivot_source, index=['f0', 'species'], aggfunc=np.sum)
print('以f0和species作为分组键创建的iris数据透视表为:\n', irisPivot.head(10))

# Same keys in the opposite order: row index (species, f0).
irisPivot = pd.pivot_table(
    _pivot_source, index=['species', 'f0'], aggfunc=np.sum)
print('以species和f0作为分组键创建的iris数据透视表为:\n', irisPivot.head(10))

# species as the row key and f0 as the column key.
irisPivot = pd.pivot_table(
    _pivot_source, index='species', columns='f0', aggfunc=np.sum)
print('以species和f0作为分组键创建的iris数据透视表为:\n', irisPivot.head(10))

# Cross-tabulation: rows are species, columns are f1 values, and each cell is
# the sum of f0 for that combination.
irisCross = pd.crosstab(
    index=df_iris['species'],
    columns=df_iris['f1'],
    values=df_iris['f0'],
    aggfunc=np.sum,
)
_top_left = irisCross.iloc[:10, :10]
print('以species和f1为分组键、f0为值的iris数据交叉透视表前10行10列为:\n', _top_left)
|