数据分析库：numpy

numpy介绍

我们知道，python的列表（list）可以用来存储任意对象，所以对于一个列表[1, 2, 3]，需要有3个指针和3个整数对象，因此用python列表做数值计算在效率上惨不忍睹，既浪费内存，也浪费时间。

所以numpy应运而生，numpy提供了两种基本的对象：ndarray和ufunc（对数组逐元素运算的通用函数）。其中核心是ndarray(N-dimensional array object)，它是用来存储单一数据类型的多维数组。ndarray是一种数据结构，本身内核是C代码，因此运行速度很快。

array的常用属性和方法

dtype属性，用来存放数据类型，创建数组时可以同时指定。
shape属性，存放数组的大小（每个维度的长度）。
reshape方法，返回一个改变了形状的数组，在可能的情况下与原数组共享内存。

创建矩阵

直接创建矩阵

使用列表、元组或者嵌套列表来创建矩阵。

import numpy as np
# Arrays can be built from a list, a tuple, or a nested list
# (one inner list per row).
a = np.array([1, 2, 3, 4])
b = np.array((5, 6, 7, 8))
c = np.array([
    [1, 2, 3, 4],
    [4, 5, 6, 7],
    [7, 8, 9, 10],
])
# Print the three arrays, each separated by a '---' line.
print(a, '---', b, '---', c, sep='\n')
# Output:
# [1 2 3 4]
# ---
# [5 6 7 8]
# ---
# [[ 1  2  3  4]
#  [ 4  5  6  7]
#  [ 7  8  9 10]]

创建全0矩阵

import numpy as np
# np.zeros builds an array of the given shape filled with zeros;
# here a 3-row, 4-column array with the default float dtype.
a = np.zeros((3, 4))
# A dtype can be given at creation time. The `np.str` alias was
# deprecated in NumPy 1.20 and removed in 1.24, so the builtin `str`
# is used instead; the "zero" value of a string array is the empty string.
b = np.zeros((3, 4), dtype=str)
print(a)
print(a.dtype)
print('---')
print(b)
print(b.dtype)

创建全1矩阵

import numpy as np
# 3-row, 4-column array of ones stored as integers.
a = np.ones(shape=(3, 4), dtype=int)
print(a, a.dtype, sep='\n')
# Output:
# [[1 1 1 1]
#  [1 1 1 1]
#  [1 1 1 1]]
# int32
# NOTE(review): int32 is what the author's machine printed; the default
# integer width is platform-dependent (commonly int64) - confirm locally.

np.arange

按照等差创建矩阵

import numpy as np

a = np.arange(10, 30, 5) # start at 10, add 5 per element, stop before reaching 30
print(a)
# Output:
# [10 15 20 25]
# The dimensions can also be changed by assigning to the shape attribute.
b = np.arange(0, 2, 0.3) # start at 0, add 0.3 per element, stop before reaching 2
print(b)
# Output:
# [ 0.   0.3  0.6  0.9  1.2  1.5  1.8]
c = np.arange(12).reshape(3, 4) # 12 elements counting up from 0 in steps of 1, reshaped to 3 rows x 4 columns
print(c)
# Output:
# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]
d = np.random.random((2, 3)) # 2x3 array of random values between 0 and 1
print(d)
# Example output (values differ on every run):
# [[ 0.83492169  0.76747417  0.3277655 ]
#  [ 0.99115563  0.32029091  0.69754825]]

np.linspace

按照元素个数取值

import numpy as np
from numpy import pi
# np.linspace(start, stop, num) returns `num` evenly spaced values;
# as the outputs below show, both end points are included.
print(np.linspace(0, 2 * pi, num=11))  # 11 values from 0 to 2*pi
# Output:
# [ 0.          0.62831853  1.25663706  1.88495559  2.51327412  3.14159265
#   3.76991118  4.39822972  5.02654825  5.65486678  6.28318531]
print(np.linspace(0, 10, num=11))  # 11 values from 0 to 10
# Output:
# [  0.   1.   2.   3.   4.   5.   6.   7.   8.   9.  10.]

np.random

创建0-1的随机数组

import numpy as np
# An integer argument gives a 1-D array with that many random values.
a = np.random.random(5)
print(a)
# Example output (values differ on every run):
# [ 0.01449445  0.61745786  0.47911107  0.80746168  0.48032829]
# A shape (here 2 rows x 3 columns) gives a multi-dimensional array.
b = np.random.random([2,3])
print(b)
# Example output:
# [[ 0.00194012  0.6861311   0.06081057]
#  [ 0.1238706   0.48659479  0.76274877]]

np.vstack

按行拼接矩阵

import numpy as np

# Three 1-D arrays of equal length; each one becomes a row of the result.
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
c = np.array([7, 8, 9])

# vstack stacks its inputs vertically, i.e. row by row.
result = np.vstack([a, b, c])
print(result)
# Output:
# [[1 2 3]
#  [4 5 6]
#  [7 8 9]]

np.hstack

按列拼接矩阵

import numpy as np

# Two 2x2 arrays of random whole numbers in 0..9 (stored as floats).
a = np.floor(10*np.random.random((2, 2)))
b = np.floor(10*np.random.random((2, 2)))

print (a)
print ('---')
print (b)
print ('---')
print (np.vstack((a, b))) # stack row-wise: b is placed below a (vertical direction)
print ('---')
print (np.hstack((a, b))) # stack column-wise: b is placed to the right of a (horizontal direction)
# Example output (values differ on every run):
# [[ 9.  4.]
#  [ 4.  4.]]
# ---
# [[ 8.  3.]
#  [ 9.  8.]]
# ---
# [[ 9.  4.]
#  [ 4.  4.]
#  [ 8.  3.]
#  [ 9.  8.]]
# ---
# [[ 9.  4.  8.  3.]
#  [ 4.  4.  9.  8.]]

np.hsplit

按列分割矩阵

import numpy as np

# 2x6 array of random whole numbers in 0..9 (stored as floats).
a = np.floor(10*np.random.random((2, 6)))

print (a)
print (np.hsplit(a, 3)) # split along the columns (horizontal direction): a is the array to split, 3 is the number of equal parts
print ('---')
print (np.hsplit(a, (2, 3, 5))) # a tuple gives the cut positions: cut before column index 2, 3 and 5, producing columns [0:2], [2:3], [3:5] and [5:]
# Example output (values differ on every run):
# [[ 2.  9.  4.  6.  1.  9.]
#  [ 7.  1.  7.  9.  3.  5.]]
# [array([[ 2.,  9.],
#        [ 7.,  1.]]), array([[ 4.,  6.],
#        [ 7.,  9.]]), array([[ 1.,  9.],
#        [ 3.,  5.]])]
# ---
# [array([[ 2.,  9.],
#        [ 7.,  1.]]), array([[ 4.],
#        [ 7.]]), array([[ 6.,  1.],
#        [ 9.,  3.]]), array([[ 9.],
#        [ 5.]])]

np.vsplit

按行分割矩阵

import numpy as np

# 6x2 array of random whole numbers in 0..9 (stored as floats).
a = np.floor(10*np.random.random((6, 2)))

print (a)
print (np.vsplit(a, 3)) # split along the rows (vertical direction): a is the array to split, 3 is the number of equal parts
print ('---')
print (np.vsplit(a, (2, 3, 5))) # a tuple gives the cut positions: cut before row index 2, 3 and 5, producing rows [0:2], [2:3], [3:5] and [5:]
# Example output (values differ on every run):
# [[ 4.  3.]
#  [ 9.  1.]
#  [ 0.  0.]
#  [ 8.  8.]
#  [ 0.  2.]
#  [ 5.  0.]]
# [array([[ 4.,  3.],
#        [ 9.,  1.]]), array([[ 0.,  0.],
#        [ 8.,  8.]]), array([[ 0.,  2.],
#        [ 5.,  0.]])]
# ---
# [array([[ 4.,  3.],
#        [ 9.,  1.]]), array([[ 0.,  0.]]), array([[ 8.,  8.],
#        [ 0.,  2.]]), array([[ 5.,  0.]])]

np.tile

多重复制tile

import numpy as np
a = np.array([5, 10, 15])
print(a, '---', sep='\n')
# reps=(4, 3): repeat the array 4 times down the rows and 3 times
# across the columns, so a length-3 input becomes a (4, 9) array.
b = np.tile(a, reps=(4, 3))
print(b, b.shape, type(b), sep='\n')
# Output:
# [ 5 10 15]
# ---
# [[ 5 10 15  5 10 15  5 10 15]
#  [ 5 10 15  5 10 15  5 10 15]
#  [ 5 10 15  5 10 15  5 10 15]
#  [ 5 10 15  5 10 15  5 10 15]]
# (4, 9)
# <class 'numpy.ndarray'>

np.reshape

改变矩阵的形状

import numpy as np

a = np.array([
    [1, 2, 3, 4],
    [4, 5, 6, 7],
    [7, 8, 9, 10],
])
print(a, '---', sep='\n')
# -1 asks NumPy to infer that dimension: 12 elements / 4 rows = 3 columns.
b = np.reshape(a, (4, -1))
# The same shape could be set in place on an existing array with any of:
#   b.shape = -1, 3
#   b.shape = (4, 3)
#   b.shape = 4, 3
print('b = ', b)
print(b.shape)
print('a = ', a)  # a still has its original (3, 4) shape
# Output:
# [[ 1  2  3  4]
#  [ 4  5  6  7]
#  [ 7  8  9 10]]
# ---
# b =  [[ 1  2  3]
#  [ 4  4  5]
#  [ 6  7  7]
#  [ 8  9 10]]
# (4, 3)
# a =  [[ 1  2  3  4]
#  [ 4  5  6  7]
#  [ 7  8  9 10]]

基本操作和运算

sum、max、min、mean

import numpy as np
test1 = np.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]])
# Called without an axis, each reduction runs over every element.
print(test1.sum())   # Output: 225
print(test1.max())   # Output: 45
print(test1.min())   # Output: 5
print(test1.mean())  # Output: 25.0

矩阵行求和sum(axis=1)

import numpy as np
test1 = np.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]])
# axis=1 sums across the columns, leaving one total per row.
print(test1.sum(axis=1))
# Output:
# [ 30  75 120]

矩阵列求和sum(axis=0)

import numpy as np
test1 = np.array([[5, 10, 15],
                  [20, 25, 30],
                  [35, 40, 45]])
# axis=0 sums down the rows, leaving one total per column.
# (The call was previously misspelled `peint`, which raised NameError.)
print(test1.sum(axis=0))
# Output:
# [60 75 90]

求平方

import numpy as np
a = np.arange(4)
# ** is applied element by element, so every entry is squared.
print(a, a ** 2, sep='\n')
# Output:
# [0 1 2 3]
# [0 1 4 9]

np.exp、np.sqrt

import numpy as np
test = np.arange(3)
print(test)
# np.exp raises e to the power of each element; np.sqrt takes each
# element's square root. Both return floating-point arrays.
print(np.exp(test), np.sqrt(test), sep='\n')
# Output:
# [0 1 2]
# [1.         2.71828183 7.3890561 ]
# [0.         1.         1.41421356]

向下取整floor

import numpy as np

# 2x3 array of random floats scaled by 10.
testRandom = 10*np.random.random((2,3))
# np.floor rounds every element down to the nearest whole number (still stored as float).
testFloor = np.floor(testRandom)

print(testRandom)
print (testFloor)

# Example output (values differ on every run):
# [[ 4.1739405   3.61074364  0.96858834]
#  [ 4.83959291  8.786262    0.74719657]]
# [[ 4.  3.  0.]
#  [ 4.  8.  0.]]

数据类型

array数组中的数据类型有下面几种

bool -- True , False
int -- int16 , int32 , int64
float -- float16 , float32 , float64
string -- string , unicode

创建时指定元素类型

import numpy as np
# Without a dtype argument NumPy infers an integer type from the values.
a = np.array([[1, 2, 3, 4], [4, 5, 6, 7], [7, 8, 9, 10]])
# With dtype='str' every element is converted to a string.
b = np.array([[1, 2, 3, 4], [4, 5, 6, 7], [7, 8, 9, 10]], dtype='str')
print (a)
print("a.dtype = ", a.dtype)
print ('---')
print (b)
print("b.dtype = ", b.dtype)
# Output:
# [[ 1  2  3  4]
#  [ 4  5  6  7]
#  [ 7  8  9 10]]
# a.dtype =  int32
# ---
# [['1' '2' '3' '4']
#  ['4' '5' '6' '7']
#  ['7' '8' '9' '10']]
# b.dtype =  <U2
# Why `<U2` rather than `str`: dtype='str' is mapped to NumPy's fixed-width
# Unicode string type. `<` means little-endian, `U` means Unicode, and `2`
# is the maximum length in characters (the longest element here is '10').
# NOTE(review): int32 is what the author's machine printed; the default
# integer width is platform-dependent (commonly int64) - confirm locally.

转换数据类型

import numpy as np
# Start from an array of strings ...
b = np.array(
    [[1, 2, 3, 4], [4, 5, 6, 7], [7, 8, 9, 10]],
    dtype=str,
)
print(b)
print("b.dtype = ", b.dtype)
# ... and convert it back to integers. astype returns a new array,
# so the result is assigned back to b.
b = b.astype(int)
print(b)
print("b.dtype = ", b.dtype)
# Output:
# [['1' '2' '3' '4']
#  ['4' '5' '6' '7']
#  ['7' '8' '9' '10']]
# b.dtype =  <U2
# [[ 1  2  3  4]
#  [ 4  5  6  7]
#  [ 7  8  9 10]]
# b.dtype =  int32
# NOTE(review): int32 is what the author's machine printed; the default
# integer width is platform-dependent (commonly int64) - confirm locally.

矩阵运算

矩阵乘法dot

import numpy as np
a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6], [7, 8]])
# * multiplies the elements at matching positions ...
print(a * b)
# ... while dot performs true matrix multiplication; the method form
# and the function form give the same result.
print(a.dot(b))
print(np.dot(a, b))
# Output:
# [[ 5 12]
#  [21 32]]
# [[19 22]
#  [43 50]]
# [[19 22]
#  [43 50]]

矩阵转置T/transpose

import numpy as np

test = np.array([[2, 3, 4], [5, 6, 7]])
# Reshape in place to 3 rows; -1 lets NumPy infer the 2 columns.
test.shape = (3, -1)

# .T and .transpose() both give the transposed (2, 3) array.
print(test, test.T, test.transpose(), sep='\n')
# Output:
# [[2 3]
#  [4 5]
#  [6 7]]
# [[2 4 6]
#  [3 5 7]]
# [[2 4 6]
#  [3 5 7]]

求特征值和特征向量eig

import numpy as np

# Square 2x2 matrix to decompose.
a = np.array([[1, 2],
              [3, 4]])

# np.linalg.eig returns the eigenvalues together with a matrix whose
# column i is the eigenvector belonging to eigenvalue i.
eigenvalues, eigenvectors = np.linalg.eig(a)

# Show both results.
print("特征值：", eigenvalues)
print("特征向量：", eigenvectors)

求矩阵的迹trace()

import numpy as np

test = np.array([[2, 3, 4], [5, 6, 7], [8, 9, 10]])

# The trace is the sum of the main diagonal (2 + 6 + 10); it is
# available both as a method and as a module-level function.
print(test, test.trace(), np.trace(test), sep='\n')
# Output:
# [[ 2  3  4]
#  [ 5  6  7]
#  [ 8  9 10]]
# 18
# 18

复制

共享内存

b = a 只是给同一个数组对象起了另一个名字，a和b指向同一个对象、同一块内存，因此通过其中任意一个名字修改元素或形状，另一个也会同时改变：

import numpy as np
a = np.arange(12)
# Plain assignment creates no new array: b is a second name for the same object.
b = a

print (a)
print (b)
print (b is a) # is b the very same object as a?
# Output:
# [ 0  1  2  3  4  5  6  7  8  9 10 11]
# [ 0  1  2  3  4  5  6  7  8  9 10 11]
# True
# Changing the shape or an element through b is therefore visible through a.
b.shape = 3, 4
b[0,0] = 100;
print (a.shape)
# Output: (3, 4)
print(a)
# Output:
# [[100   1   2   3]
#  [  4   5   6   7]
#  [  8   9  10  11]]
print (id(a))# identity of the object (its memory address in CPython)
print (id(b))
# Example output (the number differs per run, but both lines match):
# 201372576
# 201372576

浅复制view()

不是同一个对象，但共享同一份数据：修改其中一个的元素，另一个也会随之改变

# The view method creates a new array object that looks at the same data.

import numpy as np
a = np.arange(12)
b = a.view() # b is a newly created array object, but it shares its data with a

print(b is a) # is b the very same object as a?
# Output: False
print (b)
# Output: [ 0  1  2  3  4  5  6  7  8  9 10 11]
b.shape = 2, 6 # changing b's shape leaves a's shape untouched
print (a.shape)
print (b)
# Output:
# (12,)
# [[ 0  1  2  3  4  5]
#  [ 6  7  8  9 10 11]]
b[0, 4] = 1234 # set row 1, column 5 of b to 1234; the matching element of a changes too
print (b)
# Output:
# [[   0    1    2    3 1234    5]
#  [   6    7    8    9   10   11]]
print (a)
# Output: [   0    1    2    3 1234    5    6    7    8    9   10   11]

深复制copy()

不是同一个对象，数据也是独立的一份：修改副本的元素不会影响原数组

# The copy method makes a complete copy of the array and its data.

import numpy as np
a = np.arange(12)
a.shape = 3, 4
a[1, 0] = 1234

# copy() duplicates both the array object and its data.
c = a.copy()
print(c is a)
c[0, 0] = 9999 # changing an element of c does not affect a
print (c)
print (a)
# Output:
# False
# [[9999    1    2    3]
#  [1234    5    6    7]
#  [   8    9   10   11]]
# [[   0    1    2    3]
#  [1234    5    6    7]
#  [   8    9   10   11]]

查询维度ndim

import numpy as np
a = np.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]])
# ndim is the number of dimensions (axes); a rows-by-columns array has 2.
print(a.ndim)
# Output:
# 2

查询元素的个数size

import numpy as np
a = np.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]])
# size is the total number of elements: 3 rows x 3 columns.
print(a.size)
# Output: 9

查询矩阵的大小

import numpy as np
a = np.array([1, 2, 3, 4])
b = np.array([
    [1, 2, 3, 4],
    [4, 5, 6, 7],
    [7, 8, 9, 10],
])
# shape is a tuple holding the length of every dimension.
print(a.shape, '---', b.shape, sep='\n')
# Output:
# (4,)
# ---
# (3, 4)

判断==

逐元素比较，得到一个布尔数组（示例中使用的是 >=，== 等其他比较运算符的用法相同）

import numpy as np
test = np.array([5, 10, 15, 20])
# A comparison is applied element by element and yields a boolean array.
print(test >= 15)
# Output: [False False  True  True]
# Multiplying by that mask zeroes the elements below 15 (False counts
# as 0, True as 1); 15 is then added to every element.
print(test * (test >= 15) + 15)
# Output: [15 15 30 35]

排序

import numpy as np
a = np.array([[4, 3, 5], [1, 2, 1]])
print(a)
# Output:
# [[4 3 5]
#  [1 2 1]]
# np.sort returns a sorted copy; axis=1 orders the elements inside
# each row from smallest to largest.
b = np.sort(a, axis=1)
print(b)
# Output:
# [[3 4 5]
#  [1 1 2]]
# The sort method works in place, so it is applied to an independent
# copy; axis=0 orders the elements inside each column.
c = a.copy()
c.sort(axis=0)
print(c)
# Output:
# [[1 2 1]
#  [4 3 5]]
print(a)  # a is unaffected by sorting c

数据归一化

import numpy as np
# Min-max normalization: rescale a 5x5 random array so that its smallest
# element becomes 0 and its largest becomes 1.
a= 10*np.random.random((5,5))
b = a.copy()  # independent copy, normalized separately below for comparison
print(a)
print("---")
# Variant 1: compute the extremes once and reuse them.
amin, amax = a.min(), a.max()
print(amin, amax)
print("---")
a = (a-amin)/(amax-amin)
print(a)
print("---")
# Variant 2: the same formula written as a single expression.
b = (b-b.min())/(b.max() - b.min())
print(b)
# Example output (values differ on every run; both variants print the same array):
# [[ 7.33412218  7.62784714  7.06761515  6.56230239  3.76404535]
#  [ 2.68197834  6.02335055  4.67169946  5.08454875  6.97170333]
#  [ 4.02393841  3.9723266   1.82841784  7.6049149   0.38845819]
#  [ 6.55672442  1.40986757  1.14657213  3.0356768   9.55024583]
#  [ 1.06007416  1.23600072  0.97610622  8.8232397   0.39996053]]
# ---
# 0.38845818848 9.5502458299
# ---
# [[ 0.75811231  0.7901721   0.72902333  0.67386895  0.36844198]
#  [ 0.25033544  0.61504289  0.46751152  0.51257361  0.71855465]
#  [ 0.39680905  0.39117567  0.15717016  0.78766907  0.        ]
#  [ 0.67326012  0.11148582  0.08274738  0.28894128  1.        ]
#  [ 0.07330622  0.09250842  0.0641412   0.920648    0.00125547]]
# ---
# [[ 0.75811231  0.7901721   0.72902333  0.67386895  0.36844198]
#  [ 0.25033544  0.61504289  0.46751152  0.51257361  0.71855465]
#  [ 0.39680905  0.39117567  0.15717016  0.78766907  0.        ]
#  [ 0.67326012  0.11148582  0.08274738  0.28894128  1.        ]
#  [ 0.07330622  0.09250842  0.0641412   0.920648    0.00125547]]

numpy类型

类型	类型代码	说明
int8、uint8	i1、u1	有符号和无符号8位整型（1字节）
int16、uint16	i2、u2	有符号和无符号16位整型（2字节）
int32、uint32	i4、u4	有符号和无符号32位整型（4字节）
int64、uint64	i8、u8	有符号和无符号64位整型（8字节）
float16	f2	半精度浮点数
float32	f4、f	单精度浮点数
float64	f8、d	双精度浮点数
float128	f16、g	扩展精度浮点数（仅部分平台提供）
complex64	c8	分别用两个32位表示的复数
complex128	c16	分别用两个64位表示的复数
complex256	c32	分别用两个128位表示的复数（仅部分平台提供）
bool	?	布尔型
object	O	python对象
string	Sn	固定长度字符串，每个字符1字节，如S10
unicode	Un	固定长度Unicode字符串，每个字符4字节，如U10

Categories:python库

Tags:python库

huahai2022