#!/bin/bash
#
# Bring up the kernel RDMA stack
#
# This is usually run automatically by systemd after a hardware activation
# event in udev has triggered a start of the rdma.service unit
#

# Unmatched globs expand to nothing instead of the literal pattern.
shopt -s nullglob

# Optional site configuration and the awk helper used for the MTRR fixup.
CONFIG=/etc/rdma/rdma.conf
MTRR_SCRIPT=/usr/libexec/rdma-fixup-mtrr.awk

# Module groups handed to load_modules.  The ULP list is assembled below;
# the three core lists are fixed here (the sourced configuration may still
# override any of them).
LOAD_ULP_MODULES=""
LOAD_CORE_USER_MODULES="ib_umad ib_uverbs ib_ucm rdma_ucm"
LOAD_CORE_CM_MODULES="iw_cm ib_cm rdma_cm"
LOAD_CORE_MODULES="ib_core ib_mad ib_sa ib_addr"

if [ ! -f "$CONFIG" ]; then
    # No configuration file: fall back to IPoIB only.
    LOAD_ULP_MODULES="ib_ipoib"
else
    . "$CONFIG"

    # Asking for RDS also switches IPoIB on.
    case "${RDS_LOAD}" in
        yes) IPOIB_LOAD=yes ;;
    esac

    case "${IPOIB_LOAD}" in
        yes) LOAD_ULP_MODULES="ib_ipoib" ;;
    esac

    # RDS and its transports are only queued when the matching .ko file
    # exists for the running kernel.
    _rds_dir="/lib/modules/$(uname -r)/kernel/net/rds"
    if [ "${RDS_LOAD}" == "yes" ] && [ -f "$_rds_dir/rds.ko" ]; then
        LOAD_ULP_MODULES="$LOAD_ULP_MODULES rds"
        for _rds_mod in rds_tcp rds_rdma; do
            if [ -f "$_rds_dir/$_rds_mod.ko" ]; then
                LOAD_ULP_MODULES="$LOAD_ULP_MODULES $_rds_mod"
            fi
        done
    fi

    # Remaining ULPs: each <NAME>_LOAD=yes flag appends its module, in
    # this fixed order.
    for _ulp in SRP:ib_srp SRPT:ib_srpt ISER:ib_iser ISERT:ib_isert; do
        _flag="${_ulp%%:*}_LOAD"
        if [ "${!_flag}" == "yes" ]; then
            LOAD_ULP_MODULES="$LOAD_ULP_MODULES ${_ulp#*:}"
        fi
    done
    unset _rds_dir _rds_mod _ulp _flag
fi

# Report whether kernel module $1 shows up in the lsmod listing.
# Arguments: $1 - module name, matched as a whole word (grep -w)
# Returns:   the pipeline's exit status - 0 on a match, non-zero otherwise
is_module()
{
    local wanted="$1"

    /sbin/lsmod | grep -w "$wanted" &> /dev/null
}

# Load every kernel module named in the arguments that is not loaded yet.
# Arguments: module names; the argument list is deliberately re-split on
#            whitespace so a single space-separated string also works
# Outputs:   for each modprobe failure, a blank line followed by
#            "Failed to load module <name>" (no trailing newline) on stdout
# Returns:   the sum of the exit statuses of all failed modprobe calls,
#            0 when nothing failed
load_modules()
{
    local RC=0
    local module res

    for module in $*; do
	if ! is_module "$module"; then
	    /sbin/modprobe "$module"
	    res=$?
	    RC=$(( RC + res ))
	    if [ $res -ne 0 ]; then
		echo
		# Name the module that actually failed.
		echo -n "Failed to load module $module"
	    fi
	fi
    done
    return $RC
}

# This function is a horrible hack to work around BIOS authors that should
# be shot.  Specifically, certain BIOSes will map the entire 4GB address
# space as write-back cacheable when the machine has 4GB or more of RAM, and
# then they will exclude the reserved PCI I/O addresses from that 4GB
# cacheable mapping by making an overlapping uncacheable mapping.  However,
# once you do that, it is then impossible to set *any* of the PCI I/O
# address space as write-combining.  This is an absolute death-knell to
# certain IB hardware.  So, we unroll this mapping here.  Instead of
# punching a hole in a single 4GB mapping, we redo the base 4GB mapping as
# a series of discrete mappings that effectively are the same as the 4GB
# mapping minus the hole, and then we delete the uncacheable mappings that
# are used to punch the hole.  This then leaves the PCI I/O address space
# unregistered (which defaults it to uncacheable), but available for
# write-combining mappings where needed.
check_mtrr_registers()
{
    # Nothing to do unless both /proc/mtrr and the awk fixup script exist.
    [ -f /proc/mtrr -a -f $MTRR_SCRIPT ] || return

    # The awk script returns true only when it actually changed the mtrr
    # registers; otherwise stop here and hand back awk's status.
    awk -f $MTRR_SCRIPT /proc/mtrr 2>/dev/null || return

    # The registers changed, so ib_ipath has to go if it is already
    # loaded.  The udev trigger in load_hardware_modules reloads it
    # immediately afterwards, so this should not cause a problem.
    is_module ib_ipath || return 0
    /sbin/rmmod ib_ipath
}

# Get the hardware drivers loaded: optionally fix up the MTRR registers,
# ask udev to probe driverless PCI devices, then load two drivers by hand
# where needed.
# Globals:  FIXUP_MTRR_REGS (read) - "yes" runs check_mtrr_registers first
# Returns:  the sum of the statuses returned by the load_modules calls made
#           here, 0 when none failed or none were needed
load_hardware_modules()
{
    local -i RC=0

    [ "$FIXUP_MTRR_REGS" = "yes" ] && check_mtrr_registers
    # We match both class NETWORK and class INFINIBAND devices since our
    # iWARP hardware is listed under class NETWORK.  The side effect of
    # this is that we might cause a non-iWARP network driver to be loaded.
    udevadm trigger --subsystem-match=pci --attr-nomatch=driver --attr-match=class=0x020000 --attr-match=class=0x0c0600
    udevadm settle
    # An entry containing "lhca" in the device tree means ib_ehca is wanted.
    if [ -r /proc/device-tree ]; then
	if [ -n "$(ls /proc/device-tree | grep lhca)" ]; then
	    if ! is_module ib_ehca; then
		load_modules ib_ehca
		RC+=$?
	    fi
	fi
    fi
    # ocrdma is loaded only when be2net is present and ocrdma is not yet
    # loaded.  These must be two separate is_module calls joined by &&:
    # is_module looks at $1 only, so extra words after the module name
    # would be ignored and the second check would never run.
    if is_module be2net && ! is_module ocrdma; then
	load_modules ocrdma
	RC+=$?
    fi
    return $RC
}

errata_58()
{
    # Check AMD chipset issue Errata #58 and apply the workaround when the
    # affected combination of PCI devices is present.
    local id

    # Both PCI utilities must be executable, otherwise there is nothing to do.
    test -x /sbin/lspci && test -x /sbin/setpci || return 0

    # Every one of these vendor:device ids has to be reported by lspci.
    for id in 1022:1100 1022:7450 15b3:5a46; do
        /sbin/lspci -nd $id | grep "${id#*:}" > /dev/null || return 0
    done

    # Read register 69 of each 1022:1100 device; the first value that is
    # not already c0 triggers a write of c0.  A successful write ends the
    # loop, a failed one is reported and the next value is examined.
    CURVAL=$(/sbin/setpci -d 1022:1100 69)
    for val in $CURVAL; do
        [ "${val}" != "c0" ] || continue
        if /sbin/setpci -d 1022:1100 69=c0; then
            break
        fi
        echo "Failed to apply AMD-8131 Errata #58 workaround"
    done
}

# Apply the AMD-8131 Errata #56 workaround.
#
# Runs only when /sbin/lspci and /sbin/setpci are executable and lspci
# reports all three ids 1022:1100, 1022:7450 and 15b3:5a46.  For every
# 1022:7450 device it reads registers 19 (kept in $bus) and 8 (kept in
# $rev); when $rev is below 13 it writes 14 to register 72 of 15b3:5a44.
#
# Globals:  bus, dev, rev, device and num are assigned here and are not
#           declared local.
# Outputs:  failure and warning messages on stdout.
# Returns:  no explicit return; the status is that of the last command run.
errata_56()
{
    # Check AMD chipset issue Errata #56
    if test -x /sbin/lspci && test -x /sbin/setpci; then
	# All three devices must be present for the workaround to apply.
	if ( /sbin/lspci -nd 1022:1100 | grep "1100" > /dev/null ) &&
	   ( /sbin/lspci -nd 1022:7450 | grep "7450" > /dev/null ) &&
	   ( /sbin/lspci -nd 15b3:5a46 | grep "5a46" > /dev/null ); then
	    bus=""
	    # Look for AMD-8131 devices.  Only the first two ':'-separated
	    # fields of each verbose setpci output line are kept; they are
	    # used below as the -s device selector.
	    for dev in `/sbin/setpci -v -f -d 1022:7450 19 | cut -d':' -f1,2`
	    do
		bus=`/sbin/setpci -s $dev 19`
		rev=`/sbin/setpci -s $dev 8`
		# Look for a Tavor attached to the secondary bus of this
		# device.  $device itself is never used in the loop body;
		# the loop just runs once per word that setpci prints.
		for device in `/sbin/setpci -f -s $bus: -d 15b3:5a46 19`
		do
		    # NOTE(review): -lt is a decimal integer comparison, while
		    # setpci is documented to print register values in
		    # hexadecimal - a value containing a-f would make this
		    # test error out.  Confirm which base "13" is meant in.
		    if [ $rev -lt 13 ]; then
			# NOTE(review): the write targets 15b3:5a44 although the
			# presence checks above look for 15b3:5a46 - confirm
			# this is intended.
			/sbin/setpci -d 15b3:5a44 72=14
			if [ $? -eq 0 ]; then
			    # Success: leave the inner loop (the outer loop
			    # moves on to the next AMD-8131 device).
			    break
			else
			    echo
			    echo "Failed to apply AMD-8131 Errata #56 workaround"
			fi
		    else
			continue
		    fi
		    # If more than one device is on the bus then issue a
		    # warning.  Because of the break/continue above, this
		    # point is reached only after the setpci write failed.
		    num=`/sbin/setpci -f -s $bus: 0 | wc -l |  sed 's/\ *//g'`
		    if [ $num -gt 1 ]; then
			echo "Warning: your current PCI-X configuration might be incorrect."
			echo "see AMD-8131 Errata 56 for more details."
		    fi
		done
	    done
	fi
    fi
}

# Start from a known status: without this, RC would be whatever the
# environment or the sourced configuration happened to leave in it.
RC=0

# Hardware drivers first, then the core, connection-manager, user-space
# and ULP module groups.  The list variables are left unquoted on purpose
# so each module name becomes its own argument.
load_hardware_modules
RC=$(( RC + $? ))
load_modules $LOAD_CORE_MODULES
RC=$(( RC + $? ))
load_modules $LOAD_CORE_CM_MODULES
RC=$(( RC + $? ))
load_modules $LOAD_CORE_USER_MODULES
RC=$(( RC + $? ))
load_modules $LOAD_ULP_MODULES
RC=$(( RC + $? ))

# Chipset workarounds; their status does not affect the exit code.
errata_58
errata_56

# The exit status is truncated to 8 bits, so an accumulated total of 256
# would otherwise be reported as success.
if [ "$RC" -gt 255 ]; then
    RC=255
fi
exit $RC