#!/bin/bash
#
# Bring up the kernel RDMA stack
#
# This is usually run automatically by systemd after a hardware activation
# event in udev has triggered a start of the rdma.service unit
#
# Location of the optional configuration file and of the awk helper that
# check_mtrr_registers runs against /proc/mtrr.
CONFIG=/etc/rdma/rdma.conf
MTRR_SCRIPT=/usr/sbin/rdma-fixup-mtrr.awk

# Module lists. The ULP (upper layer protocol) list starts out empty and is
# filled in below, either from the configuration file or with a default.
LOAD_CORE_MODULES="ib_core ib_mad ib_sa ib_addr"
LOAD_CORE_CM_MODULES="iw_cm ib_cm rdma_cm"
LOAD_CORE_USER_MODULES="ib_umad ib_uverbs ib_ucm rdma_ucm"
LOAD_ULP_MODULES=""

if [ -f "$CONFIG" ]; then
	. "$CONFIG"
	# Asking for RDS switches IPoIB on as well.
	[ "${RDS_LOAD}" == "yes" ] && IPOIB_LOAD=yes
	# Each *_LOAD=yes setting contributes one module; ib_ipoib, when
	# requested, is always the first entry of the list.
	[ "${IPOIB_LOAD}" == "yes" ] && LOAD_ULP_MODULES="ib_ipoib"
	[ "${RDS_LOAD}" == "yes" ] && LOAD_ULP_MODULES="$LOAD_ULP_MODULES rds"
	[ "${SRP_LOAD}" == "yes" ] && LOAD_ULP_MODULES="$LOAD_ULP_MODULES ib_srp"
	[ "${ISER_LOAD}" == "yes" ] && LOAD_ULP_MODULES="$LOAD_ULP_MODULES ib_iser"
else
	# No configuration file: fall back to loading IPoIB only.
	LOAD_ULP_MODULES="ib_ipoib"
fi
# Report whether the name given as $1 shows up as a whole word in the
# lsmod listing. The function's status is grep's exit status: 0 when the
# name was found, non-zero otherwise.
is_module()
{
	local name=$1
	/sbin/lsmod | grep -w "$name" &> /dev/null
}
# Load every kernel module named in the arguments, skipping those that
# is_module already reports as present.
#
# Arguments: $@ - module names
# Outputs:   for each module modprobe fails to load, an empty line followed
#            by "Failed to load module <name>" (no trailing newline) on stdout
# Returns:   the sum of the exit statuses of the failed modprobe calls,
#            0 when nothing failed
load_modules()
{
	local RC=0
	local module res
	for module in "$@"; do
		if ! is_module "$module"; then
			/sbin/modprobe "$module"
			res=$?
			RC=$(( RC + res ))
			if [ $res -ne 0 ]; then
				echo
				# Name the module that actually failed (the loop
				# variable), so the message is not left blank.
				echo -n "Failed to load module $module"
			fi
		fi
	done
	return $RC
}
# This function is a horrible hack to work around BIOS authors that should
# be shot. Specifically, certain BIOSes will map the entire 4GB address
# space as write-back cacheable when the machine has 4GB or more of RAM, and
# then they will exclude the reserved PCI I/O addresses from that 4GB
# cacheable mapping by making an overlapping uncacheable mapping. However,
# once you do that, it is then impossible to set *any* of the PCI I/O
# address space as write-combining. This is an absolute death-knell to
# certain IB hardware. So, we unroll this mapping here. Instead of
# punching a hole in a single 4GB mapping, we redo the base 4GB mapping as
# a series of discrete mappings that effectively are the same as the 4GB
# mapping minus the hole, and then we delete the uncacheable mappings that
# are used to punch the hole. This then leaves the PCI I/O address space
# unregistered (which defaults it to uncacheable), but available for
# write-combining mappings where needed.
check_mtrr_registers()
{
	# Both /proc/mtrr and the fixup script have to exist, otherwise there
	# is nothing to run.
	[ -f /proc/mtrr -a -f $MTRR_SCRIPT ] || return $?
	# The awk script reports success only when it really changed the mtrr
	# registers; in every other case we are done and hand back its status.
	awk -f $MTRR_SCRIPT /proc/mtrr 2>/dev/null || return $?
	# The registers changed, so an already loaded ib_ipath has to be
	# unloaded. The udev trigger in load_hardware_modules reloads it
	# right away, so removing it here should not be a problem.
	if ! is_module ib_ipath; then
		return 0
	fi
	/sbin/rmmod ib_ipath
}
# Get the hardware drivers loaded: optionally fix up the MTRR registers,
# ask udev to (re)trigger driverless PCI devices, then load the RDMA
# companion modules for drivers that are present.
#
# Globals:  FIXUP_MTRR_REGS (read) - "yes" runs check_mtrr_registers first
# Returns:  the sum of the statuses returned by the load_modules calls
load_hardware_modules()
{
	local -i RC=0

	[ "$FIXUP_MTRR_REGS" = "yes" ] && check_mtrr_registers

	# We match both class NETWORK and class INFINIBAND devices since our
	# iWARP hardware is listed under class NETWORK. The side effect of
	# this is that we might cause a non-iWARP network driver to be loaded.
	udevadm trigger --subsystem-match=pci --attr-nomatch=driver --attr-match=class=0x020000 --attr-match=class=0x0c0600
	udevadm settle

	# Load ib_ehca when the device tree has an entry whose name contains
	# "lhca".
	if [ -r /proc/device-tree ]; then
		if ls /proc/device-tree | grep -q lhca; then
			if ! is_module ib_ehca; then
				load_modules ib_ehca
				RC+=$?
			fi
		fi
	fi

	# Load the RDMA module only when its base driver is loaded and the
	# RDMA module itself is not. The two checks must be joined with &&:
	# extra words after the module name are just ignored arguments to
	# is_module, so a "-a ! is_module ..." suffix never runs the second
	# check.
	if is_module cxgb3 && ! is_module iw_cxgb3; then
		load_modules iw_cxgb3
		RC+=$?
	fi
	if is_module mlx4_core && ! is_module mlx4_ib; then
		load_modules mlx4_ib
		RC+=$?
	fi
	return $RC
}
# Apply the workaround for AMD chipset Errata #58. Does nothing unless
# /sbin/lspci and /sbin/setpci are executable and lspci reports all of the
# PCI IDs 1022:1100, 1022:7450 and 15b3:5a46.
errata_58()
{
	test -x /sbin/lspci || return 0
	test -x /sbin/setpci || return 0

	# Every one of the three IDs has to be listed; bail out on the first
	# one that is missing. grep looks for the device half of the ID.
	local id
	for id in 1022:1100 1022:7450 15b3:5a46; do
		/sbin/lspci -nd $id | grep "${id#*:}" > /dev/null || return 0
	done

	CURVAL=$(/sbin/setpci -d 1022:1100 69)
	for val in $CURVAL; do
		# Values that already read c0 need no change.
		[ "${val}" != "c0" ] || continue
		# Stop after the first successful write; otherwise report the
		# failure and move on to the next value.
		if /sbin/setpci -d 1022:1100 69=c0; then
			break
		fi
		echo "Failed to apply AMD-8131 Errata #58 workaround"
	done
}
# Apply the workaround for AMD-8131 Errata #56.
#
# Does nothing unless /sbin/lspci and /sbin/setpci are executable and lspci
# reports all of the PCI IDs 1022:1100, 1022:7450 and 15b3:5a46. Takes no
# arguments; failure and warning messages are written to stdout.
errata_56()
{
# Check AMD chipset issue Errata #56
if test -x /sbin/lspci && test -x /sbin/setpci; then
if ( /sbin/lspci -nd 1022:1100 | grep "1100" > /dev/null ) &&
( /sbin/lspci -nd 1022:7450 | grep "7450" > /dev/null ) &&
( /sbin/lspci -nd 15b3:5a46 | grep "5a46" > /dev/null ); then
bus=""
# Look for AMD-8131 devices (ID 1022:7450). cut keeps the first two
# ':'-separated fields of each line printed by setpci -v; the result is
# used below as a "setpci -s" device selector.
for dev in `/sbin/setpci -v -f -d 1022:7450 19 | cut -d':' -f1,2`
do
# Register 19 of the device; used below as the bus part of a selector
# (presumably the bridge's secondary bus number - TODO confirm).
bus=`/sbin/setpci -s $dev 19`
# Register 8 of the device; compared against 13 below (presumably the
# revision ID - TODO confirm).
rev=`/sbin/setpci -s $dev 8`
# Look for Tavor devices (ID 15b3:5a46) attached to the secondary bus of
# this device.
# NOTE(review): the loop variable "device" is never used in the body, and
# the write below targets ID 15b3:5a44 rather than 15b3:5a46 - confirm
# that this is intended.
for device in `/sbin/setpci -f -s $bus: -d 15b3:5a46 19`
do
# NOTE(review): -lt compares decimal integers. If setpci prints the value
# with hex digits a-f the test fails with an error and the else branch
# (continue) runs - confirm the intended threshold and number base.
if [ $rev -lt 13 ]; then
/sbin/setpci -d 15b3:5a44 72=14
if [ $? -eq 0 ]; then
# Success: leave the inner loop only; the outer loop moves on to the
# next AMD-8131 device.
break
else
echo
echo "Failed to apply AMD-8131 Errata #56 workaround"
fi
else
continue
fi
# If more than one device is on the bus then issue a warning.
# NOTE(review): because a successful write breaks out of the loop and the
# else branch continues, this point is reached only after a failed
# write - confirm whether the warning was meant to be reached otherwise.
# sed strips the spaces from the wc -l output so the count can be compared.
num=`/sbin/setpci -f -s $bus: 0 | wc -l | sed 's/\ *//g'`
if [ $num -gt 1 ]; then
echo "Warning: your current PCI-X configuration might be incorrect."
echo "see AMD-8131 Errata 56 for more details."
fi
done
done
fi
fi
}
# Main sequence: load the hardware drivers, then the core, connection
# manager, user space and ULP modules, summing up the failure statuses.
# RC starts from a known 0 so that a value inherited from the environment
# or set by the sourced configuration file cannot skew the exit status.
RC=0
load_hardware_modules
RC=$(( RC + $? ))
# The module lists are deliberately left unquoted: each list is a single
# space-separated string that must be split into one argument per module.
load_modules $LOAD_CORE_MODULES
RC=$(( RC + $? ))
load_modules $LOAD_CORE_CM_MODULES
RC=$(( RC + $? ))
load_modules $LOAD_CORE_USER_MODULES
RC=$(( RC + $? ))
load_modules $LOAD_ULP_MODULES
RC=$(( RC + $? ))

# Add node description to sysfs: "<short hostname> HCA-<n>" for every
# entry under /sys/class/infiniband that has a writable node_desc file.
# The counter advances for every entry, writable or not.
IBSYSDIR="/sys/class/infiniband"
if [ -d "${IBSYSDIR}" ]; then
	declare -i hca_id=1
	for hca in "${IBSYSDIR}"/*; do
		if [ -w "${hca}/node_desc" ]; then
			echo -n "$(hostname | cut -f 1 -d .) HCA-${hca_id}" >> "${hca}/node_desc" 2> /dev/null
		fi
		hca_id+=1
	done
fi

errata_58
errata_56

# Exit statuses are truncated to 8 bits, so an accumulated failure count
# that is a multiple of 256 would otherwise be reported as success.
if [ "$RC" -gt 255 ]; then
	RC=255
fi
exit $RC