[toc]
sge高性能集群的搭建与使用
集群环境的准备
节点 | 系统 | iptables/selinux | IP | 主机名 | yum源 |
---|---|---|---|---|---|
Node1(master) | CentOS7.4 | iptables/selinux(off) | IP:10.180.66.11 | hostname:node1 | ali yum源 |
Node2(slave) | CentOS7.4 | iptables/selinux(off) | IP:10.180.66.12 | hostname:node2 | ali yum源 |
Node3(slave) | CentOS7.4 | iptables/selinux(off) | IP:10.180.66.13 | hostname:node3 | ali yum源 |
Node4(slave) | CentOS7.4 | iptables/selinux(off) | IP:10.180.66.14 | hostname:node4 | ali yum源 |
Node5(slave) | CentOS7.4 | iptables/selinux(off) | IP:10.180.66.15 | hostname:node5 | ali yum源 |
master 节点安装
安装相关依赖包
# yum -y install jemalloc-devel openssl-devel ncurses-devel pam-devel libXmu-devel hwloc-devel hwloc hwloc-libs java-devel javacc ant-junit libdb-devel motif-devel csh ksh xterm db4-utils perl-XML-Simple perl-Env xorg-x11-fonts-ISO8859-1-100dpi xorg-x11-fonts-ISO8859-1-75dpi
新建sge管理员用户
# groupadd -g 490 sgeadmin
# useradd -u 495 -g 490 -r -m -d /home/sgeadmin -s /bin/bash -c "SGE Admin" sgeadmin
# sed -i '/^%wheel/a\%sgeadmin ALL=(ALL) NOPASSWD: ALL' /etc/sudoers
安装sge
sge 链接 密码:c7hy
# cd /usr/local/src/
# tar -xvf ge2011.11.tar.gz
# mkdir -pv /data
# cp -a ge2011.11 /data/sge
# chown sgeadmin.sgeadmin /data/sge
qmaster 安装自动回答脚本,依赖软件包 expect,所有节点都需要安装
# yum -y install expect
# cd /data/sge/
# vim master.sh
#!/bin/bash
#
# master.sh - answer the interactive ./install_qmaster prompts automatically.
#
# Usage:    run from the directory that contains install_qmaster
#           (the SGE root, e.g. /data/sge):  sh master.sh
# Requires: /usr/bin/expect
#
# Prompts are recognised by the glob pattern *>>. Each one is answered with
# a bare Enter, except prompt 2 (answered "y") and prompt 19 (answered "n").
set -eu

if [ ! -x /usr/bin/expect ]; then
  printf '%s\n' 'master.sh: /usr/bin/expect not found; install the expect package first' >&2
  exit 1
fi
if [ ! -x ./install_qmaster ]; then
  printf '%s\n' 'master.sh: ./install_qmaster not found; run this script from the SGE root directory' >&2
  exit 1
fi

# The delimiter is quoted so the shell passes the Tcl code through untouched
# (the Tcl variables and the \n escapes must reach expect literally).
/usr/bin/expect <<'EOF'
# Highest prompt number that will be answered.
set prompt_count 50
# Answers that are not a bare Enter, keyed by 1-based prompt number.
array set answers [list 2 y 19 n]

spawn ./install_qmaster

set installer_exited 0
for {set i 1} {$i <= $prompt_count} {incr i} {
    # Wait for the next prompt. If none shows up before the expect timeout,
    # fall through and send the answer anyway. If the installer has already
    # exited, stop instead of writing to a closed process.
    expect {
        "*>>" {}
        eof {
            set installer_exited 1
            break
        }
    }
    if {[info exists answers($i)]} {
        send "$answers($i)\n"
    } else {
        send "\n"
    }
}

# Let the installer run to completion after the last answer.
if {!$installer_exited} {
    expect eof
}
EOF
# sh master.sh
修改主节点环境变量
# export SGE_ROOT=/data/sge
# echo 'export SGE_ROOT=/data/sge' >> ~/.bashrc
# echo 'PATH=$PATH:/data/sge/bin/linux-x64/:/data/sge/bin/' >> ~/.bashrc
# cp /data/sge/default/common/settings.sh /etc/profile.d/
# source /etc/profile.d/settings.sh
# source /etc/profile
添加节点
# qconf -ah node1
# qconf -ah node2
# qconf -ah node3
# qconf -ah node4
# qconf -ah node5
master 服务器搭建 nfs 服务
所有节点都需要安装
# yum -y install nfs-utils
master节点操作
# vim /etc/exports
/data/sge 10.180.66.0/24(rw,sync)
# systemctl restart nfs
slave 节点挂载
(node2,node3,node4,node5)执行
# mkdir /data/sge -pv
# mount -t nfs 10.180.66.11:/data/sge /data/sge/
# chown sgeadmin.sgeadmin /data/
slave 服务器安装sgeexecd
(node2,node3,node4,node5) 执行
# yum -y install hwloc-devel
# groupadd -g 490 sgeadmin
# useradd -u 495 -g 490 -r -m -d /home/sgeadmin -s /bin/bash -c "SGE Admin" sgeadmin
# sed -i '/^%wheel/a\%sgeadmin ALL=(ALL) NOPASSWD: ALL' /etc/sudoers
生效环境变量
# echo 'export SGE_ROOT=/data/sge' >> ~/.bashrc
# echo 'PATH=$PATH:/data/sge/bin/linux-x64/:/data/sge/bin/' >> ~/.bashrc
# echo 'export SGE_CELL=default' >> ~/.bashrc
# cp /data/sge/default/common/settings.sh /etc/profile.d/ -a
# source ~/.bashrc
# source /etc/profile
进行安装,所有节点都执行此脚本
# cd /data/sge/
# vim slave.sh
# cat slave.sh
#!/bin/bash
#
# slave.sh - answer the interactive ./install_execd prompts automatically.
#
# Usage:    run from the directory that contains install_execd
#           (the SGE root, e.g. /data/sge):  sh slave.sh
# Requires: /usr/bin/expect
#
# Prompts are recognised by the glob pattern *>>. Each of the 17 prompts is
# answered with a bare Enter.
set -eu

if [ ! -x /usr/bin/expect ]; then
  printf '%s\n' 'slave.sh: /usr/bin/expect not found; install the expect package first' >&2
  exit 1
fi
if [ ! -x ./install_execd ]; then
  printf '%s\n' 'slave.sh: ./install_execd not found; run this script from the SGE root directory' >&2
  exit 1
fi

# The delimiter is quoted so the shell passes the Tcl code through untouched
# (the Tcl variables and the \n escapes must reach expect literally).
/usr/bin/expect <<'EOF'
# Number of prompts that will be answered.
set prompt_count 17

spawn ./install_execd

set installer_exited 0
for {set i 1} {$i <= $prompt_count} {incr i} {
    # Wait for the next prompt. If none shows up before the expect timeout,
    # fall through and send Enter anyway. If the installer has already
    # exited, stop instead of writing to a closed process.
    expect {
        "*>>" {}
        eof {
            set installer_exited 1
            break
        }
    }
    send "\n"
}

# Let the installer run to completion after the last answer.
if {!$installer_exited} {
    expect eof
}
EOF
# sh slave.sh
完成集群搭建
# qhost
HOSTNAME ARCH NCPU LOAD MEMTOT MEMUSE SWAPTO SWAPUS
-------------------------------------------------------------------------------
global - - - - - - -
node1 linux-x64 1 0.01 968.3M 193.0M 2.0G 64.0K
node2 linux-x64 1 0.01 976.3M 151.0M 2.0G 0.0
node3 linux-x64 1 0.02 978.3M 152.2M 2.0G 84.0K
node4 linux-x64 1 0.02 976.3M 155.4M 2.0G 0.0
node5 linux-x64 1 0.01 978.3M 148.5M 2.0G 84.0K