diff --git a/rdma.conf b/rdma.conf
index 67b4ddc..1e620b6 100644
--- a/rdma.conf
+++ b/rdma.conf
@@ -1,12 +1,14 @@
 # Load IPoIB
 IPOIB_LOAD=yes
-# Load SRP module
+# Load SRP (SCSI RDMA Protocol initiator support) module
 SRP_LOAD=no
-# Load SRPT module
+# Load SRPT (SCSI RDMA Protocol target support) module
 SRPT_LOAD=no
-# Load iSER module
+# Load iSER (iSCSI over RDMA initiator support) module
 ISER_LOAD=no
-# Load RDS network protocol
+# Load iSERT (iSCSI over RDMA target support) module
+ISERT_LOAD=no
+# Load RDS (Reliable Datagram Sockets) network protocol
 RDS_LOAD=no
 # Should we modify the system mtrr registers? We may need to do this if you
 # get messages from the ib_ipath driver saying that it couldn't enable
diff --git a/rdma.kernel-init b/rdma.kernel-init
new file mode 100644
index 0000000..0e8d1be
--- /dev/null
+++ b/rdma.kernel-init
@@ -0,0 +1,217 @@
+#!/bin/bash
+#
+# Bring up the kernel RDMA stack
+#
+# This is usually run automatically by systemd after a hardware activation
+# event in udev has triggered a start of the rdma.service unit
+#
+
+shopt -s nullglob
+
+CONFIG=/etc/rdma/rdma.conf
+MTRR_SCRIPT=/usr/libexec/rdma-fixup-mtrr.awk
+
+LOAD_ULP_MODULES=""
+LOAD_CORE_USER_MODULES="ib_umad ib_uverbs ib_ucm rdma_ucm"
+LOAD_CORE_CM_MODULES="iw_cm ib_cm rdma_cm"
+LOAD_CORE_MODULES="ib_core ib_mad ib_sa ib_addr"
+
+if [ -f $CONFIG ]; then
+    . $CONFIG
+
+    # Requesting RDS also forces IPoIB to be loaded
+    if [ "${RDS_LOAD}" == "yes" ]; then
+        IPOIB_LOAD=yes
+    fi
+
+    if [ "${IPOIB_LOAD}" == "yes" ]; then
+        LOAD_ULP_MODULES="ib_ipoib"
+    fi
+
+    if [ "${RDS_LOAD}" == "yes" -a -f /lib/modules/`uname -r`/kernel/net/rds/rds.ko ]; then
+        LOAD_ULP_MODULES="$LOAD_ULP_MODULES rds"
+        if [ -f /lib/modules/`uname -r`/kernel/net/rds/rds_tcp.ko ]; then
+            LOAD_ULP_MODULES="$LOAD_ULP_MODULES rds_tcp"
+        fi
+        if [ -f /lib/modules/`uname -r`/kernel/net/rds/rds_rdma.ko ]; then
+            LOAD_ULP_MODULES="$LOAD_ULP_MODULES rds_rdma"
+        fi
+    fi
+
+    if [ "${SRP_LOAD}" == "yes" ]; then
+        LOAD_ULP_MODULES="$LOAD_ULP_MODULES ib_srp"
+    fi
+
+    if [ "${SRPT_LOAD}" == "yes" ]; then
+        LOAD_ULP_MODULES="$LOAD_ULP_MODULES ib_srpt"
+    fi
+
+    if [ "${ISER_LOAD}" == "yes" ]; then
+        LOAD_ULP_MODULES="$LOAD_ULP_MODULES ib_iser"
+    fi
+
+    if [ "${ISERT_LOAD}" == "yes" ]; then
+        LOAD_ULP_MODULES="$LOAD_ULP_MODULES ib_isert"
+    fi
+else
+    LOAD_ULP_MODULES="ib_ipoib"
+fi
+
+# Return 0 if $1 shows up as a whole word in the lsmod output, else 1
+is_module()
+{
+    /sbin/lsmod | grep -w "$1" > /dev/null 2>&1
+    return $?
+}
+
+# Load every module named in the arguments that is_module does not already
+# report as loaded. Returns the sum of the modprobe exit codes (0 when
+# nothing failed).
+load_modules()
+{
+    local RC=0 module res
+
+    for module in $*; do
+        if ! is_module $module; then
+            /sbin/modprobe $module
+            res=$?
+            RC=$(( RC + res ))
+            if [ $res -ne 0 ]; then
+                echo
+                echo -n "Failed to load module $module"
+            fi
+        fi
+    done
+    return $RC
+}
+
+# This function is a horrible hack to work around BIOS authors that should
+# be shot. Specifically, certain BIOSes will map the entire 4GB address
+# space as write-back cacheable when the machine has 4GB or more of RAM, and
+# then they will exclude the reserved PCI I/O addresses from that 4GB
+# cacheable mapping by making an overlapping uncacheable mapping. However,
+# once you do that, it is then impossible to set *any* of the PCI I/O
+# address space as write-combining. This is an absolute death-knell to
+# certain IB hardware. So, we unroll this mapping here.
Instead of
+# punching a hole in a single 4GB mapping, we redo the base 4GB mapping as
+# a series of discrete mappings that effectively are the same as the 4GB
+# mapping minus the hole, and then we delete the uncacheable mappings that
+# are used to punch the hole. This then leaves the PCI I/O address space
+# unregistered (which defaults it to uncacheable), but available for
+# write-combining mappings where needed.
+check_mtrr_registers()
+{
+    # If we actually change the mtrr registers, then the awk script will
+    # return true, and we need to unload the ib_ipath module if it's already
+    # loaded. The udevtrigger in load_hardware_modules will immediately
+    # reload the ib_ipath module for us, so there shouldn't be a problem.
+    [ -f /proc/mtrr -a -f $MTRR_SCRIPT ] &&
+        awk -f $MTRR_SCRIPT /proc/mtrr 2>/dev/null &&
+        if is_module ib_ipath; then
+            /sbin/rmmod ib_ipath
+        fi
+}
+
+load_hardware_modules()
+{
+    local -i RC=0
+
+    [ "$FIXUP_MTRR_REGS" = "yes" ] && check_mtrr_registers
+    # We match both class NETWORK and class INFINIBAND devices since our
+    # iWARP hardware is listed under class NETWORK. The side effect of
+    # this is that we might cause a non-iWARP network driver to be loaded.
+    udevadm trigger --subsystem-match=pci --attr-nomatch=driver --attr-match=class=0x020000 --attr-match=class=0x0c0600
+    udevadm settle
+    if [ -r /proc/device-tree ]; then
+        if [ -n "`ls /proc/device-tree | grep lhca`" ]; then
+            if ! is_module ib_ehca; then
+                load_modules ib_ehca
+                RC+=$?
+            fi
+        fi
+    fi
+    if is_module be2net && ! is_module ocrdma; then
+        load_modules ocrdma
+        RC+=$?
+    fi
+    return $RC
+}
+
+errata_58()
+{
+    # Check AMD chipset issue Errata #58
+    if test -x /sbin/lspci && test -x /sbin/setpci; then
+        if ( /sbin/lspci -nd 1022:1100 | grep "1100" > /dev/null ) &&
+           ( /sbin/lspci -nd 1022:7450 | grep "7450" > /dev/null ) &&
+           ( /sbin/lspci -nd 15b3:5a46 | grep "5a46" > /dev/null ); then
+            CURVAL=`/sbin/setpci -d 1022:1100 69`
+            for val in $CURVAL
+            do
+                if [ "${val}" != "c0" ]; then
+                    /sbin/setpci -d 1022:1100 69=c0
+                    if [ $? -eq 0 ]; then
+                        break
+                    else
+                        echo "Failed to apply AMD-8131 Errata #58 workaround"
+                    fi
+                fi
+            done
+        fi
+    fi
+}
+
+errata_56()
+{
+    # Check AMD chipset issue Errata #56
+    if test -x /sbin/lspci && test -x /sbin/setpci; then
+        if ( /sbin/lspci -nd 1022:1100 | grep "1100" > /dev/null ) &&
+           ( /sbin/lspci -nd 1022:7450 | grep "7450" > /dev/null ) &&
+           ( /sbin/lspci -nd 15b3:5a46 | grep "5a46" > /dev/null ); then
+            bus=""
+            # Look for devices AMD-8131
+            for dev in `/sbin/setpci -v -f -d 1022:7450 19 | cut -d':' -f1,2`
+            do
+                bus=`/sbin/setpci -s $dev 19`
+                rev=`/sbin/setpci -s $dev 8`
+                # Look for Tavor devices attached to the secondary bus of this device
+                for device in `/sbin/setpci -f -s $bus: -d 15b3:5a46 19`
+                do
+                    if [ $rev -lt 13 ]; then
+                        /sbin/setpci -d 15b3:5a44 72=14
+                        if [ $? -eq 0 ]; then
+                            break
+                        else
+                            echo
+                            echo "Failed to apply AMD-8131 Errata #56 workaround"
+                        fi
+                    else
+                        continue
+                    fi
+                    # If more than one device is on the bus then issue a
+                    # warning
+                    num=`/sbin/setpci -f -s $bus: 0 | wc -l | sed 's/\ *//g'`
+                    if [ $num -gt 1 ]; then
+                        echo "Warning: your current PCI-X configuration might be incorrect."
+                        echo "see AMD-8131 Errata 56 for more details."
+                    fi
+                done
+            done
+        fi
+    fi
+}
+
+load_hardware_modules
+RC=$(( RC + $? ))
+load_modules $LOAD_CORE_MODULES
+RC=$(( RC + $? ))
+load_modules $LOAD_CORE_CM_MODULES
+RC=$(( RC + $? ))
+load_modules $LOAD_CORE_USER_MODULES
+RC=$(( RC + $? ))
+load_modules $LOAD_ULP_MODULES
+RC=$(( RC + $? ))
+
+errata_58
+errata_56
+
+exit $RC
diff --git a/rdma.modules-setup.sh b/rdma.modules-setup.sh
new file mode 100644
index 0000000..77c1e14
--- /dev/null
+++ b/rdma.modules-setup.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+check() {
+    # Return 0 only for a host-only image on a machine with a verbs device, else 255
+    [ -n "$hostonly" ] && [ -d /sys/class/infiniband_verbs/uverbs0 ] && return 0
+    return 255
+}
+
+depends() {
+    return 0
+}
+
+install() {
+    inst /etc/rdma/rdma.conf
+    inst /usr/libexec/rdma-init-kernel
+    inst /usr/libexec/rdma-fixup-mtrr.awk
+    inst_multiple lspci setpci awk
+    inst_rules 98-rdma.rules 70-persistent-ipoib.rules
+}
+
+installkernel() {
+    instmods =drivers/infiniband
+}
diff --git a/rdma.sbin b/rdma.sbin
deleted file mode 100644
index f60d5b1..0000000
--- a/rdma.sbin
+++ /dev/null
@@ -1,224 +0,0 @@
-#!/bin/bash
-#
-# Bring up the kernel RDMA stack
-#
-# This is usually run automatically by systemd after a hardware activation
-# event in udev has triggered a start of the rdma.service unit
-#
-
-shopt -s nullglob
-
-CONFIG=/etc/rdma/rdma.conf
-MTRR_SCRIPT=/usr/sbin/rdma-fixup-mtrr.awk
-
-LOAD_ULP_MODULES=""
-LOAD_CORE_USER_MODULES="ib_umad ib_uverbs ib_ucm rdma_ucm"
-LOAD_CORE_CM_MODULES="iw_cm ib_cm rdma_cm"
-LOAD_CORE_MODULES="ib_core ib_mad ib_sa ib_addr"
-
-if [ -f $CONFIG ]; then
-    . $CONFIG
-
-    if [ "${RDS_LOAD}" == "yes" ]; then
-        IPOIB_LOAD=yes
-    fi
-
-    if [ "${IPOIB_LOAD}" == "yes" ]; then
-        LOAD_ULP_MODULES="ib_ipoib"
-    fi
-
-    if [ "${RDS_LOAD}" == "yes" -a -f /lib/modules/`uname -r`/kernel/net/rds/rds.ko ]; then
-        LOAD_ULP_MODULES="$LOAD_ULP_MODULES rds"
-    fi
-
-    if [ "${SRP_LOAD}" == "yes" ]; then
-        LOAD_ULP_MODULES="$LOAD_ULP_MODULES ib_srp"
-    fi
-
-    if [ "${SRPT_LOAD}" == "yes" ]; then
-        LOAD_ULP_MODULES="$LOAD_ULP_MODULES ib_srpt"
-    fi
-
-    if [ "${ISER_LOAD}" == "yes" ]; then
-        LOAD_ULP_MODULES="$LOAD_ULP_MODULES ib_iser"
-    fi
-else
-    LOAD_ULP_MODULES="ib_ipoib"
-fi
-
-# If module $1 is loaded return - 0 else - 1
-is_module()
-{
-    /sbin/lsmod | grep -w "$1" > /dev/null 2>&1
-    return $?
-} - -load_modules() -{ - local RC=0 - - for module in $*; do - if ! is_module $module; then - /sbin/modprobe $module - res=$? - RC=$[ $RC + $res ] - if [ $res -ne 0 ]; then - echo - echo -n "Failed to load module $mod" - fi - fi - done - return $RC -} - -# This function is a horrible hack to work around BIOS authors that should -# be shot. Specifically, certain BIOSes will map the entire 4GB address -# space as write-back cacheable when the machine has 4GB or more of RAM, and -# then they will exclude the reserved PCI I/O addresses from that 4GB -# cacheable mapping by making on overlapping uncacheable mapping. However, -# once you do that, it is then impossible to set *any* of the PCI I/O -# address space as write-combining. This is an absolute death-knell to -# certain IB hardware. So, we unroll this mapping here. Instead of -# punching a hole in a single 4GB mapping, we redo the base 4GB mapping as -# a series of discreet mappings that effectively are the same as the 4GB -# mapping minus the hole, and then we delete the uncacheable mappings that -# are used to punch the hole. This then leaves the PCI I/O address space -# unregistered (which defaults it to uncacheable), but available for -# write-combining mappings where needed. -check_mtrr_registers() -{ - # If we actually change the mtrr registers, then the awk script will - # return true, and we need to unload the ib_ipath module if it's already - # loaded. The udevtrigger in load_hardware_modules will immediately - # reload the ib_ipath module for us, so there shouldn't be a problem. - [ -f /proc/mtrr -a -f $MTRR_SCRIPT ] && - awk -f $MTRR_SCRIPT /proc/mtrr 2>/dev/null && - if is_module ib_ipath; then - /sbin/rmmod ib_ipath - fi -} - -load_hardware_modules() -{ - local -i RC=0 - - [ "$FIXUP_MTRR_REGS" = "yes" ] && check_mtrr_registers - # We match both class NETWORK and class INFINIBAND devices since our - # iWARP hardware is listed under class NETWORK. 
The side effect of - # this is that we might cause a non-iWARP network driver to be loaded. - udevadm trigger --subsystem-match=pci --attr-nomatch=driver --attr-match=class=0x020000 --attr-match=class=0x0c0600 - udevadm settle - if [ -r /proc/device-tree ]; then - if [ -n "`ls /proc/device-tree | grep lhca`" ]; then - if ! is_module ib_ehca; then - load_modules ib_ehca - RC+=$? - fi - fi - fi - if is_module cxgb3 -a ! is_module iw_cxgb3; then - load_modules iw_cxgb3 - RC+=$? - fi - if is_module mlx4_core -a ! is_module mlx4_ib; then - load_modules mlx4_ib - RC+=$? - fi - if is_module be2net -a ! is_module ocrdma; then - load_modules ocrdma - RC+=$? - fi - return $RC -} - -errata_58() -{ - # Check AMD chipset issue Errata #58 - if test -x /sbin/lspci && test -x /sbin/setpci; then - if ( /sbin/lspci -nd 1022:1100 | grep "1100" > /dev/null ) && - ( /sbin/lspci -nd 1022:7450 | grep "7450" > /dev/null ) && - ( /sbin/lspci -nd 15b3:5a46 | grep "5a46" > /dev/null ); then - CURVAL=`/sbin/setpci -d 1022:1100 69` - for val in $CURVAL - do - if [ "${val}" != "c0" ]; then - /sbin/setpci -d 1022:1100 69=c0 - if [ $? -eq 0 ]; then - break - else - echo "Failed to apply AMD-8131 Errata #58 workaround" - fi - fi - done - fi - fi -} - -errata_56() -{ - # Check AMD chipset issue Errata #56 - if test -x /sbin/lspci && test -x /sbin/setpci; then - if ( /sbin/lspci -nd 1022:1100 | grep "1100" > /dev/null ) && - ( /sbin/lspci -nd 1022:7450 | grep "7450" > /dev/null ) && - ( /sbin/lspci -nd 15b3:5a46 | grep "5a46" > /dev/null ); then - bus="" - # Look for devices AMD-8131 - for dev in `/sbin/setpci -v -f -d 1022:7450 19 | cut -d':' -f1,2` - do - bus=`/sbin/setpci -s $dev 19` - rev=`/sbin/setpci -s $dev 8` - # Look for Tavor attach to secondary bus of this devices - for device in `/sbin/setpci -f -s $bus: -d 15b3:5a46 19` - do - if [ $rev -lt 13 ]; then - /sbin/setpci -d 15b3:5a44 72=14 - if [ $? 
-eq 0 ]; then - break - else - echo - echo "Failed to apply AMD-8131 Errata #56 workaround" - fi - else - continue - fi - # If more than one device is on the bus the issue a - # warning - num=`/sbin/setpci -f -s $bus: 0 | wc -l | sed 's/\ *//g'` - if [ $num -gt 1 ]; then - echo "Warning: your current PCI-X configuration might be incorrect." - echo "see AMD-8131 Errata 56 for more details." - fi - done - done - fi - fi -} - -load_hardware_modules -RC=$[ $RC + $? ] -load_modules $LOAD_CORE_MODULES -RC=$[ $RC + $? ] -load_modules $LOAD_CORE_CM_MODULES -RC=$[ $RC + $? ] -load_modules $LOAD_CORE_USER_MODULES -RC=$[ $RC + $? ] -load_modules $LOAD_ULP_MODULES -RC=$[ $RC + $? ] - -# Add node description to sysfs -IBSYSDIR="/sys/class/infiniband" -if [ -d ${IBSYSDIR} ]; then - pushd $IBSYSDIR - for hca in * - do - if [ -w ${hca}/node_desc ]; then - echo -n "$(hostname -s) ${hca}" >> ${hca}/node_desc 2> /dev/null - fi - done - popd -fi - -errata_58 -errata_56 - -exit $RC diff --git a/rdma.service b/rdma.service index ef31379..fe5007e 100644 --- a/rdma.service +++ b/rdma.service @@ -4,12 +4,12 @@ Documentation=file:/etc/rdma/rdma.conf RefuseManualStop=true DefaultDependencies=false Conflicts=emergency.target emergency.service -Before=network.target remote-fs-pre.target +Before=sysinit.target [Service] Type=oneshot RemainAfterExit=yes -ExecStart=/usr/sbin/rdma-init-kernel +ExecStart=/usr/libexec/rdma-init-kernel [Install] WantedBy=sysinit.target diff --git a/rdma.spec b/rdma.spec index 4af946a..121ae47 100644 --- a/rdma.spec +++ b/rdma.spec @@ -6,7 +6,7 @@ Summary: Infiniband/iWARP Kernel Module Initializer Name: rdma Version: 2.0 -Release: 13%{?dist} +Release: 14%{?dist} License: GPLv2+ Group: System Environment/Base Source0: rdma.conf @@ -15,15 +15,17 @@ Source2: rdma.fixup-mtrr.awk Source4: rdma.ifup-ib Source5: rdma.ifdown-ib Source6: rdma.service -Source7: rdma.sbin +Source7: rdma.kernel-init Source8: rdma.udev-rules +Source9: rdma.modules-setup.sh BuildRoot: 
%{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) BuildArch: noarch -BuildRequires: systemd +BuildRequires: systemd-units Requires: udev >= 095 -Requires(post): systemd -Requires(preun): systemd -Requires(postun): systemd +Requires(post): systemd-units +Requires(preun): systemd-units +Requires(postun): systemd-units +%global dracutlibdir %{_prefix}/lib/dracut %description User space initialization scripts for the kernel InfiniBand/iWARP drivers @@ -37,19 +39,21 @@ rm -rf %{buildroot} install -d %{buildroot}%{_sysconfdir}/%{name} install -d %{buildroot}%{_sysconfdir}/udev/rules.d install -d %{buildroot}%{_sysconfdir}/sysconfig/network-scripts -install -d %{buildroot}%{_sbindir} +install -d %{buildroot}%{_libexecdir} install -d %{buildroot}%{_unitdir} -install -d %{buildroot}/lib/udev/rules.d +install -d %{buildroot}%{_udevrulesdir} +install -d %{buildroot}%{dracutlibdir}/modules.d/05rdma # Stuff to go into the base package install -m 0644 %{SOURCE0} %{buildroot}%{_sysconfdir}/%{name}/%{name}.conf install -m 0644 %{SOURCE1} %{buildroot}%{_sysconfdir}/udev/rules.d/70-persistent-ipoib.rules install -m 0644 %{SOURCE6} %{buildroot}%{_unitdir}/rdma.service -install -m 0755 %{SOURCE7} %{buildroot}%{_sbindir}/rdma-init-kernel -install -m 0644 %{SOURCE2} %{buildroot}%{_sbindir}/rdma-fixup-mtrr.awk +install -m 0755 %{SOURCE7} %{buildroot}%{_libexecdir}/rdma-init-kernel +install -m 0644 %{SOURCE2} %{buildroot}%{_libexecdir}/rdma-fixup-mtrr.awk install -m 0755 %{SOURCE4} %{buildroot}%{_sysconfdir}/sysconfig/network-scripts/ifup-ib install -m 0755 %{SOURCE5} %{buildroot}%{_sysconfdir}/sysconfig/network-scripts/ifdown-ib -install -m 0644 %{SOURCE8} %{buildroot}/lib/udev/rules.d/98-rdma.rules +install -m 0644 %{SOURCE8} %{buildroot}%{_udevrulesdir}/98-rdma.rules +install -m 0755 %{SOURCE9} %{buildroot}%{dracutlibdir}/modules.d/05rdma/module-setup.sh %clean rm -rf %{buildroot} @@ -69,12 +73,17 @@ rm -rf %{buildroot} %config(noreplace) 
%{_sysconfdir}/%{name}/%{name}.conf %config(noreplace) %{_sysconfdir}/udev/rules.d/* %{_unitdir}/%{name}.service -%{_sbindir}/rdma-init-kernel -%{_sbindir}/rdma-fixup-mtrr.awk +%{_libexecdir}/rdma-init-kernel +%{_libexecdir}/rdma-fixup-mtrr.awk %{_sysconfdir}/sysconfig/network-scripts/* -/lib/udev/rules.d/* +%{_udevrulesdir}/* +%dir %{dracutlibdir}/modules.d/05rdma +%{dracutlibdir}/modules.d/05rdma/module-setup.sh %changelog +* Tue Jul 22 2014 Doug Ledford - 2.0-14 +- Fold in improvements made in the rhel7 tree back to Fedora + * Tue Nov 26 2013 Doug Ledford - 2.0-13 - Fix bug in ifdown-ib script handling of P_Key devs - Move setting of node_desc to udev rules and make it more reliable diff --git a/rdma.udev-rules b/rdma.udev-rules index 01e01de..b988e87 100644 --- a/rdma.udev-rules +++ b/rdma.udev-rules @@ -9,7 +9,7 @@ SUBSYSTEM=="module", KERNEL=="ib_*", ACTION=="add", TAG+="systemd", ENV{SYSTEMD_ SUBSYSTEM=="module", KERNEL=="mlx*", ACTION=="add", TAG+="systemd", ENV{SYSTEMD_WANTS}="rdma.service" SUBSYSTEM=="module", KERNEL=="iw_*", ACTION=="add", TAG+="systemd", ENV{SYSTEMD_WANTS}="rdma.service" SUBSYSTEM=="module", KERNEL=="be2net", ACTION=="add", TAG+="systemd", ENV{SYSTEMD_WANTS}="rdma.service" -SUBSYSTEM=="module", KERNEL=="enic", ACTION=="add", TAG+="systemd", ENV{SYSTEMD_WANTS}="rdma.service" +SUBSYSTEM=="module", KERNEL=="usnic*", ACTION=="add", TAG+="systemd", ENV{SYSTEMD_WANTS}="rdma.service" # When we detect a new verbs device is added to the system, set the node # description on that device