#!/bin/bash
#
# Bring up the kernel RDMA stack
#
# This is usually run automatically by systemd after a hardware activation
# event in udev has triggered a start of the rdma.service unit
#
# Exit status: the sum of the failure statuses collected while loading the
# hardware, core, connection-manager, user-space and ULP modules (0 when
# everything loaded).
#

# load_hardware_modules relies on unmatched globs expanding to nothing.
shopt -s nullglob

CONFIG=/etc/rdma/rdma.conf
MTRR_SCRIPT=/usr/libexec/rdma-fixup-mtrr.awk

LOAD_ULP_MODULES=""
LOAD_CORE_USER_MODULES="ib_umad ib_uverbs ib_ucm rdma_ucm"
LOAD_CORE_CM_MODULES="iw_cm ib_cm rdma_cm"
LOAD_CORE_MODULES="ib_core ib_mad ib_sa ib_addr"

# Build the list of upper-layer protocol (ULP) modules from the *_LOAD
# settings in the config file.  Without a config file only IPoIB is loaded.
if [[ -f "$CONFIG" ]]; then
    . "$CONFIG"

    # Enabling RDS also forces IPoIB on.
    if [[ "${RDS_LOAD}" == "yes" ]]; then
        IPOIB_LOAD=yes
    fi
    if [[ "${IPOIB_LOAD}" == "yes" ]]; then
        LOAD_ULP_MODULES="ib_ipoib"
    fi

    # RDS and its transports are only queued when the running kernel
    # actually ships the corresponding module files.
    RDS_MODULE_DIR="/lib/modules/$(uname -r)/kernel/net/rds"
    if [[ "${RDS_LOAD}" == "yes" && -f "${RDS_MODULE_DIR}/rds.ko" ]]; then
        LOAD_ULP_MODULES="$LOAD_ULP_MODULES rds"
        if [[ -f "${RDS_MODULE_DIR}/rds_tcp.ko" ]]; then
            LOAD_ULP_MODULES="$LOAD_ULP_MODULES rds_tcp"
        fi
        if [[ -f "${RDS_MODULE_DIR}/rds_rdma.ko" ]]; then
            LOAD_ULP_MODULES="$LOAD_ULP_MODULES rds_rdma"
        fi
    fi

    if [[ "${SRP_LOAD}" == "yes" ]]; then
        LOAD_ULP_MODULES="$LOAD_ULP_MODULES ib_srp"
    fi
    if [[ "${SRPT_LOAD}" == "yes" ]]; then
        LOAD_ULP_MODULES="$LOAD_ULP_MODULES ib_srpt"
    fi
    if [[ "${ISER_LOAD}" == "yes" ]]; then
        LOAD_ULP_MODULES="$LOAD_ULP_MODULES ib_iser"
    fi
    if [[ "${ISERT_LOAD}" == "yes" ]]; then
        LOAD_ULP_MODULES="$LOAD_ULP_MODULES ib_isert"
    fi
else
    LOAD_ULP_MODULES="ib_ipoib"
fi

# Return 0 if module $1 is currently loaded, 1 otherwise.
#
# Only the first column of the lsmod output (the module name) is compared,
# so a name that merely appears in the header, size or "Used by" columns
# does not count as loaded.
is_module()
{
    # lsmod always prints names with underscores; accept the dash spelling too.
    local wanted=${1//-/_}

    /sbin/lsmod | awk -v mod="$wanted" \
        '$1 == mod { found = 1 } END { exit !found }'
}

# Load every module named in the arguments that is not already loaded.
#
# Arguments: module names, one per argument
# Outputs:   a "Failed to load module <name>" message for each failure
# Returns:   the sum of the modprobe exit statuses (0 if all succeeded)
load_modules()
{
    local rc=0
    local res
    local module

    for module in "$@"; do
        if ! is_module "$module"; then
            /sbin/modprobe "$module"
            res=$?
            rc=$(( rc + res ))
            if [ $res -ne 0 ]; then
                echo
                echo -n "Failed to load module $module"
            fi
        fi
    done
    return $rc
}

# This function is an ugly hack to work around broken BIOSes.
# Specifically, certain BIOSes will map the entire 4GB address
# space as write-back cacheable when the machine has 4GB or more of RAM, and
# then they will exclude the reserved PCI I/O addresses from that 4GB
# cacheable mapping by making an overlapping uncacheable mapping.  However,
# once you do that, it is then impossible to set *any* of the PCI I/O
# address space as write-combining.  This is an absolute death-knell to
# certain IB hardware.  So, we unroll this mapping here.  Instead of
# punching a hole in a single 4GB mapping, we redo the base 4GB mapping as
# a series of discrete mappings that effectively are the same as the 4GB
# mapping minus the hole, and then we delete the uncacheable mappings that
# are used to punch the hole.  This then leaves the PCI I/O address space
# unregistered (which defaults it to uncacheable), but available for
# write-combining mappings where needed.
check_mtrr_registers()
{
    # If we actually change the mtrr registers, then the awk script will
    # return true, and we need to unload the ib_ipath module if it's already
    # loaded.  The udevtrigger in load_hardware_modules will immediately
    # reload the ib_ipath module for us, so there shouldn't be a problem.
    if [ -f /proc/mtrr ] && [ -f "$MTRR_SCRIPT" ] &&
        awk -f "$MTRR_SCRIPT" /proc/mtrr 2>/dev/null; then
        if is_module ib_ipath; then
            /sbin/rmmod ib_ipath
        fi
    fi
}

# Trigger udev so that drivers for the RDMA-capable PCI hardware get loaded,
# then load the few drivers that udev does not pick up on its own.
#
# Globals: FIXUP_MTRR_REGS (read) - "yes" enables the MTRR fixup first
# Returns: the sum of the load_modules statuses (0 if nothing failed)
load_hardware_modules()
{
    local -i rc=0
    local -a lhca_nodes

    [ "$FIXUP_MTRR_REGS" = "yes" ] && check_mtrr_registers

    # We match both class NETWORK and class INFINIBAND devices since our
    # iWARP hardware is listed under class NETWORK.  The side effect of
    # this is that we might cause a non-iWARP network driver to be loaded.
    udevadm trigger --subsystem-match=pci --attr-nomatch=driver --attr-match=class=0x020000 --attr-match=class=0x0c0600
    udevadm settle

    # An "lhca" entry in the device tree means ib_ehca is needed.
    if [ -r /proc/device-tree ]; then
        # With nullglob set, the array is empty when nothing matches.
        lhca_nodes=(/proc/device-tree/*lhca*)
        if (( ${#lhca_nodes[@]} > 0 )); then
            if ! is_module ib_ehca; then
                load_modules ib_ehca
                rc+=$?
            fi
        fi
    fi

    # ocrdma is loaded on top of be2net when be2net is present.
    if is_module be2net && ! is_module ocrdma; then
        load_modules ocrdma
        rc+=$?
    fi
    return $rc
}

# Return 0 when lspci reports all three PCI devices that the AMD-8131 errata
# workarounds are keyed on: 1022:1100, the AMD-8131 (1022:7450) and the
# Tavor (15b3:5a46).
has_amd8131_tavor()
{
    /sbin/lspci -nd 1022:1100 | grep -q "1100" &&
        /sbin/lspci -nd 1022:7450 | grep -q "7450" &&
        /sbin/lspci -nd 15b3:5a46 | grep -q "5a46"
}

# Check AMD chipset issue Errata #58: make sure register 0x69 of the
# 1022:1100 device(s) reads c0, writing it if any instance differs.
errata_58()
{
    local curval
    local val

    if test -x /sbin/lspci && test -x /sbin/setpci; then
        if has_amd8131_tavor; then
            curval=$(/sbin/setpci -d 1022:1100 69)
            # setpci prints one value per matching device; split on whitespace.
            for val in $curval; do
                if [ "${val}" != "c0" ]; then
                    # The write addresses every 1022:1100 device at once, so
                    # one successful write is enough.
                    if /sbin/setpci -d 1022:1100 69=c0; then
                        break
                    else
                        echo "Failed to apply AMD-8131 Errata #58 workaround"
                    fi
                fi
            done
        fi
    fi
}

# Check AMD chipset issue Errata #56: for every AMD-8131 with a revision
# below 0x13 that has a Tavor on its secondary bus, write 14 to register
# 0x72 of the 15b3:5a44 device.
errata_56()
{
    local bus=""
    local rev
    local dev
    local device
    local num

    if test -x /sbin/lspci && test -x /sbin/setpci; then
        if has_amd8131_tavor; then
            # Look for devices AMD-8131
            for dev in $(/sbin/setpci -v -f -d 1022:7450 19 | cut -d':' -f1,2); do
                bus=$(/sbin/setpci -s "$dev" 19)
                rev=$(/sbin/setpci -s "$dev" 8)

                # Look for Tavor attach to secondary bus of this devices
                for device in $(/sbin/setpci -f -s "$bus:" -d 15b3:5a46 19); do
                    # setpci prints register values in hexadecimal, so the
                    # revision has to be compared as hex; a decimal test
                    # fails on values containing a-f.
                    if ! [[ "$rev" =~ ^[0-9a-fA-F]+$ ]] ||
                        (( 16#$rev >= 16#13 )); then
                        continue
                    fi

                    if /sbin/setpci -d 15b3:5a44 72=14; then
                        break
                    fi
                    echo
                    echo "Failed to apply AMD-8131 Errata #56 workaround"

                    # If more than one device is on the bus then issue a
                    # warning
                    num=$(( $(/sbin/setpci -f -s "$bus:" 0 | wc -l) ))
                    if [ "$num" -gt 1 ]; then
                        echo "Warning: your current PCI-X configuration might be incorrect."
                        echo "see AMD-8131 Errata 56 for more details."
                    fi
                done
            done
        fi
    fi
}

# Start from a known value so an inherited or config-supplied RC cannot
# leak into the exit status.
RC=0

load_hardware_modules
RC=$(( RC + $? ))
load_modules $LOAD_CORE_MODULES
RC=$(( RC + $? ))
load_modules $LOAD_CORE_CM_MODULES
RC=$(( RC + $? ))
load_modules $LOAD_CORE_USER_MODULES
RC=$(( RC + $? ))
load_modules $LOAD_ULP_MODULES
RC=$(( RC + $? ))

errata_58
errata_56

exit $RC