#!/bin/bash
#
# Bring up/down the kernel RDMA stack
#
# chkconfig: - 05 95
# description: Loads/Unloads InfiniBand and iWARP kernel modules
# config:	/etc/rdma/rdma.conf
#
### BEGIN INIT INFO
# Provides:       rdma
# Default-Stop: 0 1 2 3 4 5 6
# Required-Start:
# Required-Stop:
# Short-Description: Loads and unloads the InfiniBand and iWARP kernel modules
# Description: Loads and unloads the InfiniBand and iWARP kernel modules
### END INIT INFO
# Optional site configuration.  When present it is sourced and its
# IPOIB_LOAD / RDS_LOAD / SRP_LOAD / ISER_LOAD settings (compared against
# "yes") select which upper layer protocol modules get loaded.
CONFIG=/etc/rdma/rdma.conf

# Distribution init-script helper library (expected to provide the
# echo_success / echo_failure helpers used further down).
. /etc/rc.d/init.d/functions

# Module lists in load order: core first, then connection managers, then
# user space access, then the upper layer protocols chosen below.
LOAD_ULP_MODULES=""
LOAD_CORE_USER_MODULES="ib_umad ib_uverbs ib_ucm rdma_ucm"
LOAD_CORE_CM_MODULES="iw_cm ib_cm rdma_cm"
LOAD_CORE_MODULES="ib_core ib_mad ib_sa ib_addr"

if [ -f "$CONFIG" ]; then
    . "$CONFIG"

    # Requesting RDS forces IPoIB on as well.
    if [ "${RDS_LOAD}" == "yes" ]; then
        IPOIB_LOAD=yes
    fi

    if [ "${IPOIB_LOAD}" == "yes" ]; then
        LOAD_ULP_MODULES="ib_ipoib"
    fi

    if [ "${RDS_LOAD}" == "yes" ]; then
        LOAD_ULP_MODULES="$LOAD_ULP_MODULES rds"
    fi

    if [ "${SRP_LOAD}" == "yes" ]; then
        LOAD_ULP_MODULES="$LOAD_ULP_MODULES ib_srp"
    fi

    if [ "${ISER_LOAD}" == "yes" ]; then
        LOAD_ULP_MODULES="$LOAD_ULP_MODULES ib_iser"
    fi
else
    # No configuration file: default to IPoIB only.
    LOAD_ULP_MODULES="ib_ipoib"
fi

# Module lists in unload order (the reverse of the load order above).
UNLOAD_ULP_MODULES="ib_iser ib_srp rds ib_ipoib"
UNLOAD_HW_MODULES="iw_c2 iw_cxgb3 iw_nes ib_ehca ib_ipath ib_mthca mlx4_ib"
UNLOAD_CORE_USER_MODULES="rdma_ucm ib_ucm ib_uverbs ib_umad"
UNLOAD_CORE_CM_MODULES="rdma_cm ib_cm iw_cm"
UNLOAD_CORE_MODULES="ib_addr ib_sa ib_mad ib_core"

# Names of the ib* network interfaces currently known to the kernel.
# "ip link show" prints child interfaces as "name@parent:", so both the
# trailing colon and any "@parent" suffix are stripped to leave a name that
# ifdown and "ip link show <name>" accept.
interfaces=$(ip link show | grep ": ib[0-9]*" | cut -f 2 -d ' ' | sed -e 'y/\r/ /;s/://;s/@.*//')
# Report whether a kernel module is currently loaded.
# Arguments: $1 - module name as printed in the first column of lsmod
# Returns:   0 if the module is loaded, 1 otherwise
is_module()
{
    # Match the name exactly against the first column only, skipping the
    # header line, so neither the size column nor the "Used by" list can
    # produce a hit and the name is never treated as a regular expression.
    /sbin/lsmod 2>/dev/null |
        awk -v name="$1" 'NR > 1 && $1 == name { found = 1 } END { exit !found }'
}
# Load every named kernel module that is not already loaded.
# Arguments: $@ - module names
# Outputs:   a newline followed by "Failed to load module <name>" (no
#            trailing newline) on stdout for each module modprobe rejects
# Returns:   0 when every modprobe succeeded, otherwise the sum of the
#            modprobe exit codes, capped at 255
load_modules()
{
    local module
    local -i res=0 RC=0

    for module in "$@"; do
        if ! is_module "$module"; then
            /sbin/modprobe "$module"
            res=$?
            RC=$(( RC + res ))
            if [ $res -ne 0 ]; then
                echo
                echo -n "Failed to load module $module"
            fi
        fi
    done

    # A return status is only 8 bits wide; without the cap a sum of exactly
    # 256 would wrap around and be reported as success.
    if [ $RC -gt 255 ]; then
        RC=255
    fi
    return $RC
}
# Unload a single kernel module if it is currently loaded.
# Arguments: $1 - module name
# Outputs:   a blank line and "Failed to unload <name>" on stdout when rmmod
#            fails
# Returns:   0 if the module was not loaded or was removed, 1 if rmmod failed
unload_module()
{
    local mod=$1

    # Nothing to do for a module that is not loaded.
    is_module "$mod" || return 0

    if ! /sbin/rmmod "$mod" > /dev/null 2>&1; then
        echo
        echo "Failed to unload $mod"
        return 1
    fi
    return 0
}
# This function is a horrible hack to work around BIOS authors that should
# be shot.  Specifically, certain BIOSes will map the entire 4GB address
# space as write-back cacheable when the machine has 4GB or more of RAM, and
# then they will exclude the reserved PCI I/O addresses from that 4GB
# cacheable mapping by making an overlapping uncacheable mapping.  However,
# once you do that, it is then impossible to set *any* of the PCI I/O
# address space as write-combining.  This is an absolute death-knell to
# certain IB hardware.  So, we unroll this mapping here.  Instead of
# punching a hole in a single 4GB mapping, we redo the base 4GB mapping as
# a series of discrete mappings that effectively are the same as the 4GB
# mapping minus the hole, and then we delete the uncacheable mappings that
# are used to punch the hole.  This then leaves the PCI I/O address space
# unregistered (which defaults it to uncacheable), but available for
# write-combining mappings where needed.
#
# Returns: non-zero when /proc/mtrr or the fixup script is missing or the
#          awk script reports that nothing was changed; otherwise the status
#          of the ib_ipath unload step (0 when ib_ipath was not loaded).
check_mtrr_registers()
{
    if [ ! -f /proc/mtrr ] || [ ! -f /etc/rdma/fixup-mtrr.awk ]; then
        return 1
    fi

    # If we actually change the mtrr registers, then the awk script will
    # return true, and we need to unload the ib_ipath module if it's already
    # loaded.  The "udevadm trigger" in load_hardware_modules will
    # immediately reload the ib_ipath module for us, so there shouldn't be a
    # problem.
    awk -f /etc/rdma/fixup-mtrr.awk /proc/mtrr 2>/dev/null || return

    if is_module ib_ipath; then
        /sbin/rmmod ib_ipath
    fi
}
# Trigger loading of the low level RDMA hardware drivers, then load the
# RDMA companion module for any base driver that is present without it.
# Globals:  FIXUP_MTRR_REGS (read) - "yes" runs check_mtrr_registers first
# Returns:  sum of the load_modules statuses; 0 when nothing failed
load_hardware_modules()
{
    local -i RC=0
    local node

    [ "$FIXUP_MTRR_REGS" = "yes" ] && check_mtrr_registers

    # WARNING!!  If you are using this script to take down and bring up
    # your IB interfaces on a machine that uses more than one low level
    # Infiniband hardware driver, then there is no guarantee that the
    # ordering of rdma interfaces after you take down and bring up the
    # stack will be the same as the ordering of the interfaces on a
    # clean boot.
    #
    # We match both class NETWORK and class INFINIBAND devices since our
    # iWARP hardware is listed under class NETWORK.  The side effect of
    # this is that we might cause a non-iWARP network driver to be loaded.
    udevadm trigger --subsystem-match=pci --attr-nomatch=driver --attr-match=class=0x020000 --attr-match=class=0x0c0600
    udevadm settle

    # An "lhca" entry in the device tree means ib_ehca is wanted.  Glob for
    # it instead of parsing ls output; an unmatched pattern stays literal and
    # is rejected by the -e test.
    for node in /proc/device-tree/*lhca*; do
        [ -e "$node" ] || continue
        if ! is_module ib_ehca; then
            load_modules ib_ehca
            RC=$(( RC + $? ))
        fi
        break
    done

    # The two tests must be joined with the shell's && operator: written as
    # "is_module cxgb3 -a ! is_module iw_cxgb3" everything after the first
    # name is passed to is_module as extra arguments and never evaluated.
    if is_module cxgb3 && ! is_module iw_cxgb3; then
        load_modules iw_cxgb3
        RC=$(( RC + $? ))
    fi
    if is_module mlx4_core && ! is_module mlx4_ib; then
        load_modules mlx4_ib
        RC=$(( RC + $? ))
    fi
    return $RC
}
# Apply the AMD-8131 Errata #58 workaround.
# When PCI devices 1022:1100, 1022:7450 and 15b3:5a46 are all present and
# register 69 of a 1022:1100 device does not read "c0", write c0 to it.
# Outputs: "Failed to apply AMD-8131 Errata #58 workaround" on stdout each
#          time the register write fails
errata_58()
{
    local curval val

    # Check AMD chipset issue Errata #58
    if test -x /sbin/lspci && test -x /sbin/setpci; then
        if /sbin/lspci -nd 1022:1100 | grep "1100" > /dev/null &&
           /sbin/lspci -nd 1022:7450 | grep "7450" > /dev/null &&
           /sbin/lspci -nd 15b3:5a46 | grep "5a46" > /dev/null; then
            curval=$(/sbin/setpci -d 1022:1100 69)
            for val in $curval; do
                if [ "${val}" != "c0" ]; then
                    # The write addresses every 1022:1100 device at once, so
                    # one successful write is enough and ends the loop.
                    if /sbin/setpci -d 1022:1100 69=c0; then
                        break
                    else
                        echo "Failed to apply AMD-8131 Errata #58 workaround"
                    fi
                fi
            done
        fi
    fi
}
# Apply the AMD-8131 Errata #56 workaround.
# When PCI devices 1022:1100, 1022:7450 and 15b3:5a46 are all present, look
# at each 1022:7450 device: if a 15b3:5a46 device sits on the bus named by
# its register 19 and its register 8 (revision) is below 0x13, write 14 to
# register 72 of the 15b3:5a44 devices.
# Outputs: failure / PCI-X configuration warnings on stdout
errata_56()
{
    local dev bus rev device num

    # Check AMD chipset issue Errata #56
    if test -x /sbin/lspci && test -x /sbin/setpci; then
        if /sbin/lspci -nd 1022:1100 | grep "1100" > /dev/null &&
           /sbin/lspci -nd 1022:7450 | grep "7450" > /dev/null &&
           /sbin/lspci -nd 15b3:5a46 | grep "5a46" > /dev/null; then
            bus=""
            # Look for devices AMD-8131
            for dev in $(/sbin/setpci -v -f -d 1022:7450 19 | cut -d':' -f1,2); do
                bus=$(/sbin/setpci -s "$dev" 19)
                rev=$(/sbin/setpci -s "$dev" 8)
                # Look for Tavor attached to the secondary bus of this device
                for device in $(/sbin/setpci -f -s "$bus:" -d 15b3:5a46 19); do
                    # setpci prints register values in hexadecimal, so the
                    # revision has to be compared as a base-16 number.  A
                    # decimal "-lt 13" test only happens to work for digit-only
                    # values and errors out on revisions such as "0a".
                    # Anything that is not a hex number is treated as not
                    # affected.
                    if [[ $rev =~ ^[0-9a-fA-F]+$ ]] && (( 16#$rev < 16#13 )); then
                        if /sbin/setpci -d 15b3:5a44 72=14; then
                            break
                        else
                            echo
                            echo "Failed to apply AMD-8131 Errata #56 workaround"
                        fi
                    else
                        continue
                    fi
                    # If more than one device is on the bus then issue a
                    # warning.
                    # NOTE(review): because of the break above this is only
                    # reached after a failed write - confirm that is intended.
                    num=$(/sbin/setpci -f -s "$bus:" 0 | wc -l | sed 's/\ *//g')
                    if [ "$num" -gt 1 ]; then
                        echo "Warning: your current PCI-X configuration might be incorrect."
                        echo "see AMD-8131 Errata 56 for more details."
                    fi
                done
            done
        fi
    fi
}
# Bring the RDMA stack up: hardware drivers, core, connection management,
# user space access and the configured upper layer protocol modules.
# Afterwards appends "<short hostname> HCA-<n>" to each writable
# /sys/class/infiniband/*/node_desc, runs the AMD-8131 errata workarounds
# and creates the subsystem lock file.
# Returns: sum of the statuses of the module loading steps (0 on success)
start()
{
    local -i RC=0 hca_id=1
    local hca

    echo -n "Loading OpenIB kernel modules:"

    load_hardware_modules
    RC=$(( RC + $? ))
    # The list variables are expanded unquoted on purpose: each word is one
    # module name.
    load_modules $LOAD_CORE_MODULES
    RC=$(( RC + $? ))
    load_modules $LOAD_CORE_CM_MODULES
    RC=$(( RC + $? ))
    load_modules $LOAD_CORE_USER_MODULES
    RC=$(( RC + $? ))
    load_modules $LOAD_ULP_MODULES
    RC=$(( RC + $? ))

    # Add node description to sysfs
    IBSYSDIR="/sys/class/infiniband"
    if [ -d "${IBSYSDIR}" ]; then
        for hca in "${IBSYSDIR}"/*; do
            if [ -w "${hca}/node_desc" ]; then
                echo -n "$(hostname | cut -f 1 -d .) HCA-${hca_id}" >> "${hca}/node_desc" 2> /dev/null
            fi
            # The index advances for every entry, writable or not.
            hca_id=$(( hca_id + 1 ))
        done
    fi

    errata_58
    errata_56

    # The lock file is created even when some modules failed to load.
    touch /var/lock/subsys/rdma
    if [ $RC -eq 0 ]; then
        echo_success
    else
        echo_failure
    fi
    echo
    return $RC
}
# Take the RDMA stack down.
# Refuses to run (status 1) while opensm, osmtest or srp_daemon appear in the
# process list or while the svcrdma module is loaded.  Otherwise brings down
# the ib* interfaces, unloads ULP / user / CM modules, then hardware and core
# modules, and removes the subsystem lock file.
# Globals:  interfaces, UNLOAD_*_MODULES (read)
# Returns:  0 on success, 1 when blocked, otherwise the number of modules
#           that failed to unload
stop()
{
    # Check if applications which use infiniband are running
    local apps="opensm osmtest srp_daemon"
    local app i mod cfg
    local -i RC=0

    echo -n "Unloading OpenIB kernel modules:"

    for app in $apps; do
        if ps -ef | grep "$app" | grep -v grep > /dev/null 2>&1; then
            echo
            echo "Found $app running."
            echo "Please stop all RDMA applications before downing the stack."
            echo_failure
            echo
            return 1
        fi
    done

    if is_module svcrdma; then
        echo "NFSoRDMA support is still enabled."
        echo "Please stop the nfs-rdma service before stopping the rdma service."
        echo_failure
        echo
        return 1
    fi

    if ! is_module ib_core; then
        # Nothing to do, make sure lock file is gone and return
        rm -f /var/lock/subsys/rdma
        echo_success
        echo
        return 0
    fi

    # Down all IPoIB interfaces
    if is_module ib_ipoib; then
        for i in $interfaces; do
            cfg=/etc/sysconfig/network-scripts/ifcfg-$i
            # Forget the bonding settings of the previous interface so they
            # cannot leak into this iteration, and only source a config file
            # that actually exists.
            unset SLAVE MASTER
            if [ -f "$cfg" ]; then
                . "$cfg"
            fi
            # A bonding slave: take its master down first.
            if [ "${SLAVE}" = yes ] && [ -n "${MASTER}" ]; then
                ifdown "${MASTER}" > /dev/null 2>&1
            fi
            ifdown "$i" > /dev/null 2>&1
        done
    fi
    # Small sleep to let the ifdown settle before we remove any modules
    sleep 1

    # Unload OpenIB modules
    for mod in $UNLOAD_ULP_MODULES $UNLOAD_CORE_USER_MODULES $UNLOAD_CORE_CM_MODULES; do
        unload_module "$mod"
        RC=$(( RC + $? ))
    done
    # Insert a sleep here for all the ULP modules to have been fully removed
    # before proceeding to unload the driver modules
    sleep 1
    for mod in $UNLOAD_HW_MODULES $UNLOAD_CORE_MODULES; do
        unload_module "$mod"
        RC=$(( RC + $? ))
    done

    rm -f /var/lock/subsys/rdma
    if [ $RC -eq 0 ]; then
        echo_success
    else
        echo_failure
    fi
    echo
    return $RC
}
# Helper for status: print a heading, then "<name> " for every loaded module
# among the remaining arguments, or "none found", then two newlines.
# Leaves the number of loaded modules in $cnt; status declares cnt local, so
# the assignment lands in the caller's variable.
_status_show_group()
{
    local heading=$1 m
    shift

    printf '%s\n\t' "$heading"
    cnt=0
    for m in "$@"; do
        if is_module "$m"; then
            echo -n "$m "
            cnt=$(( cnt + 1 ))
        fi
    done
    [ $cnt -eq 0 ] && echo -n "none found"
    echo
    echo
}

# Report which RDMA modules are loaded and, when ib_ipoib is loaded, which
# IPoIB interfaces are configured and which are up.
# Globals:  UNLOAD_*_MODULES, interfaces (read)
# Returns:  0 if any RDMA module is loaded; otherwise 2 if the subsystem
#           lock file exists, 3 if it does not
status()
{
    local -i cnt=0
    local -i modules=0
    local module="" i

    _status_show_group "Low level hardware support loaded:" $UNLOAD_HW_MODULES
    modules=$(( modules + cnt ))
    _status_show_group "Upper layer protocol modules:" $UNLOAD_ULP_MODULES
    modules=$(( modules + cnt ))
    _status_show_group "User space access modules:" $UNLOAD_CORE_USER_MODULES
    modules=$(( modules + cnt ))
    _status_show_group "Connection management modules:" $UNLOAD_CORE_CM_MODULES
    modules=$(( modules + cnt ))

    # Core modules are counted but not listed.
    for module in $UNLOAD_CORE_MODULES; do
        if is_module "$module"; then
            modules=$(( modules + 1 ))
        fi
    done

    if is_module ib_ipoib; then
        echo -n "Configured IPoIB interfaces: "
        cnt=0
        for i in /etc/sysconfig/network-scripts/ifcfg-ib*; do
            if [ -f "$i" ]; then
                # Do not report the previous file's DEVICE for a config that
                # does not set one.
                unset DEVICE
                . "$i"
                echo -n "$DEVICE "
                cnt=$(( cnt + 1 ))
            fi
        done
        [ $cnt -eq 0 ] && echo -n "none"
        echo
        echo -n "Currently active IPoIB interfaces: "
        cnt=0
        for i in $interfaces; do
            if ip link show "$i" | grep -w UP > /dev/null 2>&1; then
                echo -n "$i "
                cnt=$(( cnt + 1 ))
            fi
        done
        [ $cnt -eq 0 ] && echo -n "none"
        echo
    fi

    if [ $modules -eq 0 ]; then
        if [ -f /var/lock/subsys/rdma ]; then
            return 2
        else
            return 3
        fi
    else
        return 0
    fi
}
# Stop the stack, then start it again.  start runs even when stop reported
# a failure.
# Returns: the status of start
restart ()
{
    stop
    start
}
# Restart the stack only if the subsystem lock file exists.
# Returns: 0 when there is no lock file, otherwise the status of restart.
#          An explicit if/else is used because "test && restart || return 0"
#          would also report success when restart itself failed.
condrestart ()
{
    if [ -e /var/lock/subsys/rdma ]; then
        restart
    else
        return 0
    fi
}
# Print the list of supported actions, framed by blank lines.
# Returns: 2
usage ()
{
    echo
    echo "Usage: $(basename -- "$0") {start|stop|restart|condrestart|try-restart|force-reload|status}"
    echo
    return 2
}
# Every action that changes system state requires root; anyone else gets
# exit status 4 (the LSB code for insufficient privilege).  "status" and
# the usage message stay available to unprivileged users.
case "$1" in
    start|stop|restart|condrestart|try-restart|force-reload)
        if [ "$(id -u)" != "0" ]; then
            exit 4
        fi
        ;;
esac

# Dispatch the requested action and exit with its status.
case "$1" in
    start) start; RC=$? ;;
    stop) stop; RC=$? ;;
    restart) restart; RC=$? ;;
    # reload is not supported: 3 is the LSB code for an unimplemented feature.
    reload) RC=3 ;;
    condrestart) condrestart; RC=$? ;;
    try-restart) condrestart; RC=$? ;;
    force-reload) condrestart; RC=$? ;;
    status) status; RC=$? ;;
    *) usage; RC=$? ;;
esac

exit $RC