Ceph and SPDK deployment and performance testing

Raw disk I/O

# drop the page cache so dd measures the device rather than memory
sync && echo 3 | sudo tee /proc/sys/vm/drop_caches

# sequential write test
dd if=/dev/zero of=/dev/nvme0n1 bs=1M count=64

# drop the cache again before the read test
sync && echo 3 | sudo tee /proc/sys/vm/drop_caches

# sequential read test
dd if=/dev/nvme0n1 of=/dev/null bs=1M count=40960

Results

Run   Read rate   Write rate
1     1.7 GB/s    1.5 GB/s
2     1.7 GB/s    1.5 GB/s
3     1.7 GB/s    1.5 GB/s
4     1.7 GB/s    1.5 GB/s
5     1.7 GB/s    1.5 GB/s
avg   1.7 GB/s    1.5 GB/s
# check CPU and memory usage of a process during the test (replace xxx with the process of interest)
ps -e -o %cpu,rss,cmd | grep xxx
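Each results table of five runs below was produced by hand; a minimal sketch like the following (device path, run count, and log file names are assumptions) repeats the same drop-cache-then-dd sequence and keeps dd's summary lines for averaging:

# repeat the raw-disk test five times and collect dd's summary lines (dd prints them to stderr)
DEV=/dev/nvme0n1   # assumption: the disk under test
for i in 1 2 3 4 5; do
    sync && echo 3 | sudo tee /proc/sys/vm/drop_caches
    dd if=/dev/zero of=$DEV bs=1M count=40960 2>&1 | tail -n 1 >> write.log
    sync && echo 3 | sudo tee /proc/sys/vm/drop_caches
    dd if=$DEV of=/dev/null bs=1M count=40960 2>&1 | tail -n 1 >> read.log
done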

SPDK NVMe-oF

# Enter the spdk source directory
$ cd spdk

# Run the setup script to hand device control over to SPDK
$ sudo scripts/setup.sh
0000:03:00.0 (15ad 07f0): nvme -> uio_pci_generic

# Check device status
$ sudo scripts/setup.sh status
Hugepages
node hugesize free / total
node0 1048576kB 0 / 0
node0 2048kB 1024 / 1024

Type BDF Vendor Device NUMA Driver Device Block devices
NVMe 0000:01:00.0 126f 2263 0 uio_pci_generic - -
NVMe 0000:02:00.0 10ec 5765 0 nvme nvme1 nvme1n1

# Start nvmf_tgt
$ sudo build/bin/nvmf_tgt

# Create the TCP transport
$ sudo scripts/rpc.py nvmf_create_transport -t TCP -u 16384 -m 8 -c 8192

# Create an NVMe bdev from the local NVMe controller
$ sudo scripts/rpc.py bdev_nvme_attach_controller -b NVMe1 -t PCIe -a 0000:01:00.0


# Assign the NVMe bdev to a subsystem
# First create the subsystem
$ sudo scripts/rpc.py nvmf_create_subsystem nqn.2016-06.io.spdk:cnode1 -a -s SPDK00000000000001 -d SPDK_Controller1

# Add the NVMe bdev as a namespace of the subsystem just created
$ sudo scripts/rpc.py nvmf_subsystem_add_ns nqn.2016-06.io.spdk:cnode1 NVMe1n1

# Add a TCP listener to the subsystem
$ sudo scripts/rpc.py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:cnode1 -t TCP -a 192.168.12.85 -s 4420
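
# Optional: the RPC configuration above lasts only until nvmf_tgt exits.
# rpc.py's save_config / load_config can dump and replay it (the JSON file name here is just an example).
$ sudo scripts/rpc.py save_config > nvmf_config.json
# ...after restarting nvmf_tgt, restore the transport/bdev/subsystem setup:
$ sudo scripts/rpc.py load_config < nvmf_config.json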



# On the client side
# Install nvme-cli
$ apt install nvme-cli

# Load the NVMe/TCP kernel module
$ sudo modprobe nvme-tcp

# Discover the target
$ nvme discover -t tcp -a 192.168.12.85 -s 4420

Discovery Log Number of Records 1, Generation counter 1
=====Discovery Log Entry 0======
trtype: tcp
adrfam: ipv4
subtype: nvme subsystem
treq: not required
portid: 0
trsvcid: 4420
subnqn: nqn.2016-06.io.spdk:cnode1
traddr: 192.168.12.85
sectype: none
# Connect to the target
$ nvme connect -t tcp -n "nqn.2016-06.io.spdk:cnode1" -a 192.168.12.85 -s 4420
# List NVMe devices; the entry at the bottom showing SPDK_Controller1 is the remote SPDK namespace
$ nvme list
Node SN Model Namespace Usage Format FW Rev
---------------- -------------------- ---------------------------------------- --------- -------------------------- ---------------- --------
/dev/nvme0n1 FXS500231001140 Fanxiang S500 512GB 1 512.11 GB / 512.11 GB 512 B + 0 B V0808A0
/dev/nvme1n1 2022122303702 Colorful CN600 500GB 1 500.11 GB / 500.11 GB 512 B + 0 B VC2S038D
/dev/nvme2n1 SPDK00000000000001 SPDK_Controller1 1 512.11 GB / 512.11 GB 512 B + 0 B 23.05

# On the target side, CPU and memory usage can be checked this way
$ scripts/spdk_top.py -s 192.168.12.85 -p 4420

# When the tests are done, disconnect
nvme disconnect-all
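# Or disconnect just this subsystem by its NQN (nvme-cli's -n/--nqn option)
nvme disconnect -n "nqn.2016-06.io.spdk:cnode1"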



sync && echo 3 | sudo tee /proc/sys/vm/drop_caches

dd if=/dev/zero of=/dev/nvme2n1 bs=1M count=40960

sync && echo 3 | sudo tee /proc/sys/vm/drop_caches

dd if=/dev/nvme2n1 of=/dev/null bs=1M count=40960

Results

Run   Read rate   Write rate
1     112 MB/s    97.4 MB/s
2     112 MB/s    103 MB/s
3     112 MB/s    97.1 MB/s
4     112 MB/s    98.8 MB/s
5     112 MB/s    100 MB/s
avg   112 MB/s    99.26 MB/s

The TCP throughput between the two hosts, measured with iperf3, is about 114 MB/s.
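The output below comes from a plain iperf3 run between the two hosts; a minimal sketch of the two ends (default options assumed, since the exact invocation was not recorded):

# on the SPDK target host (192.168.12.85): start the iperf3 server
iperf3 -s

# on the client host: run a 10-second TCP test against it
iperf3 -c 192.168.12.85 -t 10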

Connecting to host 192.168.12.85, port 5201
[ 5] local 192.168.12.78 port 54836 connected to 192.168.12.85 port 5201
[ ID] Interval Transfer Bitrate Retr Cwnd
[ 5] 0.00-1.00 sec 110 MBytes 920 Mbits/sec 41 260 KBytes
[ 5] 1.00-2.00 sec 109 MBytes 915 Mbits/sec 38 284 KBytes
[ 5] 2.00-3.00 sec 109 MBytes 916 Mbits/sec 35 258 KBytes
[ 5] 3.00-4.00 sec 109 MBytes 918 Mbits/sec 25 209 KBytes
[ 5] 4.00-5.00 sec 109 MBytes 915 Mbits/sec 27 258 KBytes
[ 5] 5.00-6.00 sec 109 MBytes 915 Mbits/sec 33 260 KBytes
[ 5] 6.00-7.00 sec 109 MBytes 915 Mbits/sec 28 201 KBytes
[ 5] 7.00-8.00 sec 109 MBytes 915 Mbits/sec 32 260 KBytes
[ 5] 8.00-9.00 sec 109 MBytes 914 Mbits/sec 32 279 KBytes
[ 5] 9.00-10.00 sec 109 MBytes 917 Mbits/sec 25 263 KBytes
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval Transfer Bitrate Retr
[ 5] 0.00-10.00 sec 1.07 GBytes 916 Mbits/sec 316 sender
[ 5] 0.00-10.04 sec 1.07 GBytes 912 Mbits/sec receiver

Conclusion: the TCP link is saturated, so the bottleneck is network bandwidth. 916 Mbit/s is roughly 114 MB/s, which matches the ~112 MB/s read rate measured over NVMe-oF above.

Results on a single machine

Run   Read rate   Write rate   Write mem   Write CPU   Read mem   Read CPU
1     1.6 GB/s    1.5 GB/s     -           -           -          -
2     1.6 GB/s    1.5 GB/s     -           -           -          -
3     1.6 GB/s    1.5 GB/s     -           -           -          -
4     1.6 GB/s    1.5 GB/s     -           -           -          -
5     1.6 GB/s    1.5 GB/s     -           -           -          -
avg   1.6 GB/s    1.5 GB/s     12832 KB    99.8%       12832 KB   99.8%

The ~99.8% CPU is expected: the SPDK target busy-polls on its reactor thread, so it keeps one core fully occupied regardless of I/O load.

Ceph

# Enable time synchronization on every node
$ systemctl start chronyd.service
$ systemctl enable chronyd.service

# Disable the firewall on every node
$ systemctl stop firewalld.service
$ systemctl stop iptables.service
$ systemctl disable firewalld.service
$ systemctl disable iptables.service

# Install the deployment tools
apt install cephadm ceph-deploy

# Deploy: create a working directory and the initial cluster definition
mkdir ceph-cluster
cd ceph-cluster/
ceph-deploy new mon-node1

# Edit ceph.conf and add the following
public_network = 192.168.12.0/24
cluster_network = 192.168.12.0/24
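# Rough sketch of the resulting ceph-cluster/ceph.conf ([global] section); the fsid and
# mon entries are whatever ceph-deploy generated, shown here only for orientation:
#   [global]
#   fsid = d4c3e01a-b2af-4587-98ce-51cf01e2883d
#   mon_initial_members = mon-node1
#   mon_host = <mon-node1 address>
#   public_network = 192.168.12.0/24
#   cluster_network = 192.168.12.0/24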

# Install the required packages on every node
apt install ceph radosgw

# Activate the monitor
ceph-deploy mon create-initial

# Copy the config file and admin keyring to all nodes
ceph-deploy admin mon-node1 osd-node0 osd-node1

# Activate the mgr
ceph-deploy mgr create mon-node1

# Check the cluster status
ceph -s
  cluster:
    id:     d4c3e01a-b2af-4587-98ce-51cf01e2883d
    health: HEALTH_WARN
            mon is allowing insecure global_id reclaim
            OSD count 0 < osd_pool_default_size 3

  services:
    mon: 1 daemons, quorum mon-node1 (age 5m)
    mgr: mon-node1(active, since 61s)
    osd: 0 osds: 0 up, 0 in

  data:
    pools:   0 pools, 0 pgs
    objects: 0 objects, 0 B
    usage:   0 B used, 0 B / 0 B avail
    pgs:
# The first warning can be cleared either by muting it or, preferably, by disallowing insecure global_id reclaim
ceph config set mon mon_warn_on_insecure_global_id_reclaim false
ceph config set mon mon_warn_on_insecure_global_id_reclaim_allowed false
ceph config set mon auth_allow_insecure_global_id_reclaim false

# Add OSDs
ceph-deploy disk zap mon-node1 /dev/nvme0n1
ceph-deploy disk zap osd-node0 /dev/nvme0n1
ceph-deploy disk zap osd-node1 /dev/nvme0n1

ceph-deploy osd create mon-node1 --data /dev/nvme0n1
ceph-deploy osd create osd-node0 --data /dev/nvme0n1
ceph-deploy osd create osd-node1 --data /dev/nvme0n1

# Check the status again
ceph -s
  cluster:
    id:     d4c3e01a-b2af-4587-98ce-51cf01e2883d
    health: HEALTH_OK

  services:
    mon: 1 daemons, quorum mon-node1 (age 20m)
    mgr: mon-node1(active, since 16m)
    osd: 3 osds: 3 up (since 33s), 3 in (since 33s)

  task status:

  data:
    pools:   1 pools, 1 pgs
    objects: 0 objects, 0 B
    usage:   3.0 GiB used, 1.4 TiB / 1.4 TiB avail
    pgs:     1 active+clean

# Expose block storage to clients
# Create a pool and a 10 GiB image (layering only, so the kernel client can map it)
ceph osd pool create cephrbd 128 128
rbd create image -s 10G --image-feature layering -p cephrbd

# On the client side, map the image
rbd map cephrbd/image
# The output below is the emulated block device; the performance and resource-usage tests are run against it
/dev/rbd0


sync && echo 3 | sudo tee /proc/sys/vm/drop_caches

dd if=/dev/zero of=/dev/rbd0 bs=1M count=4096

sync && echo 3 | sudo tee /proc/sys/vm/drop_caches

dd if=/dev/rbd0 of=/dev/null bs=1M count=4096

Read/write rates

Run   Read rate    Write rate
1     143 MB/s     65.3 MB/s
2     144 MB/s     70.7 MB/s
3     144 MB/s     69.7 MB/s
4     144 MB/s     72.5 MB/s
5     144 MB/s     73.3 MB/s
avg   143.8 MB/s   70.3 MB/s
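As a cross-check that bypasses the kernel RBD client, Ceph's built-in rados bench can be pointed at the same pool; a minimal sketch using the pool created above (the 60-second duration is arbitrary, and these results were not collected here):

# 60-second write benchmark against the cephrbd pool, keeping the objects for the read pass
rados bench -p cephrbd 60 write --no-cleanup

# sequential read benchmark over the objects just written, then clean them up
rados bench -p cephrbd 60 seq
rados -p cephrbd cleanup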

Resource usage

ps -e -o %cpu,rss,cmd | grep ceph

           Idle CPU   Idle mem     Read CPU   Read mem     Write CPU   Write mem
ceph-mon   0.4%       88096 KB     0.4%       138132 KB    0.4%        110284 KB
ceph-mgr   0.2%       197760 KB    0.2%       199076 KB    0.2%        198812 KB
ceph-osd   0.3%       69956 KB     3.4%       594784 KB    3.1%        353572 KB

It later turned out that these figures hardly depend on whether the cluster is idle or under load: CPU stays around 0.4% / 0.2% / 3.7% (mon / mgr / osd) and memory is almost constant as well.

A reasonable guess is that they depend more on how much data is stored: only with more, and more varied, data would the numbers change noticeably.

Repeatedly running dd against a single RBD image is unlikely to move them much, but the difference between before and after a dd run is still a useful reference point.
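Since only snapshots before and after each dd run were compared, a small sampling loop like this sketch (interval and duration are arbitrary) would also show how the daemons behave while a test is in progress:

# sample CPU and RSS of the Ceph daemons once per second for two minutes
for i in $(seq 120); do
    date '+%T' >> ceph-usage.log
    ps -e -o %cpu,rss,cmd | grep -E '[c]eph-(mon|mgr|osd)' >> ceph-usage.log
    sleep 1
done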