aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'sys-cluster/openib-osm/files/sldd.sh')
-rwxr-xr-xsys-cluster/openib-osm/files/sldd.sh251
1 files changed, 251 insertions, 0 deletions
diff --git a/sys-cluster/openib-osm/files/sldd.sh b/sys-cluster/openib-osm/files/sldd.sh
new file mode 100755
index 000000000..9604b848f
--- /dev/null
+++ b/sys-cluster/openib-osm/files/sldd.sh
@@ -0,0 +1,251 @@
+#!/bin/bash
+#
+# Copyright (c) 2006 Mellanox Technologies. All rights reserved.
+#
+# This Software is licensed under one of the following licenses:
+#
+# 1) under the terms of the "Common Public License 1.0" a copy of which is
+# available from the Open Source Initiative, see
+# http://www.opensource.org/licenses/cpl.php.
+#
+# 2) under the terms of the "The BSD License" a copy of which is
+# available from the Open Source Initiative, see
+# http://www.opensource.org/licenses/bsd-license.php.
+#
+# 3) under the terms of the "GNU General Public License (GPL) Version 2" a
+# copy of which is available from the Open Source Initiative, see
+# http://www.opensource.org/licenses/gpl-license.php.
+#
+# Licensee has the right to choose one of the above licenses.
+#
+# Redistributions of source code must retain the above copyright
+# notice and one of the license notices.
+#
+# Redistributions in binary form must reproduce both the above copyright
+# notice, one of the license notices in the documentation
+# and/or other materials provided with the distribution.
+#
+#
+# $Id: sldd.sh 7779 2006-06-07 12:05:46Z vlad $
+#
+
+# OpenSM found to have the following problem
+# when handover is performed:
+# If some of the cluster nodes are rebooted during the handover they loose their LID assignment.
+# The reason for it is that the standby SM does not obey its own Guid to LID table
+# and simply uses the discovered LIDs. If some nodes are not available for it
+# their previous LID assignment is lost forever.
+
+# The idea is to use an external daemon that will distribute
+# the semi-static LID assignment table from the master SM to all standby SMs.
+# A standby SM, becoming a master . needs to obey the copied semi static LID assignment table.
+
+# config: /etc/opensm.conf
+
+CONFIG=/etc/opensm.conf
+
+SLDD_DEBUG=${SLDD_DEBUG:-0}
+
+if [ ! -f ${CONFIG} ]; then
+ [ $SLDD_DEBUG -eq 1 ] &&
+ echo "${CONFIG} not found."
+ exit 0
+fi
+
+. ${CONFIG}
+
+CACHE_FILE=${CACHE_FILE:-/var/cache/osm/guid2lid}
+CACHE_DIR=$(dirname ${CACHE_FILE})
+tmp_cache=${CACHE_FILE}.tmp
+
+PING='ping -w 1 -c 1'
+
+RCP=${RCP:-/usr/bin/scp}
+RSH=${RSH:-/usr/bin/ssh}
+IFCONFIG=${IFCONFIG:-'/sbin/ifconfig -a'}
+
+declare -i SLDD_DEBUG
+RESCAN_TIME=${RESCAN_TIME:-60}
+
+if [ -z "${OSM_HOSTS}" ]; then
+ [ $SLDD_DEBUG -eq 1 ] &&
+ echo "No OpenSM servers (OSM_HOSTS) configured for the IB subnet."
+ exit 0
+fi
+
+
+declare -a arr_OSM_HOSTS
+arr_OSM_HOSTS=(${OSM_HOSTS})
+
+num_of_osm_hosts=${#arr_OSM_HOSTS[@]}
+
+if [ ${num_of_osm_hosts} -eq 1 ]; then
+ [ $SLDD_DEBUG -eq 1 ] &&
+ echo "One OpenSM server configured in the IB subnet." &&
+ echo "Nothing to be done for SLDD"
+
+ exit 0
+fi
+
+trap 'trap_handler' 15
+
+trap_handler()
+{
+ logger -i "SLDD: Exiting."
+ exit 0
+}
+
+is_alive()
+{
+ $PING $1 > /dev/null 2>&1
+ return $?
+}
+
+is_local()
+{
+ $IFCONFIG | grep -w "$1" > /dev/null 2>&1
+ return $?
+}
+
+update_remote_cache()
+{
+ /bin/rm -f ${CACHE_FILE}.upd
+ /bin/cp -a ${CACHE_FILE} ${CACHE_FILE}.upd
+
+ [ $SLDD_DEBUG -eq 1 ] &&
+ echo "Updating remote cache file"
+
+ for host in ${OSM_HOSTS}
+ do
+ # Skip local host update
+ if [ "${host}" == "${local_host}" ]; then
+ continue
+ fi
+
+ if is_alive $host; then
+ stat=$($RSH $host "/bin/mkdir -p ${CACHE_DIR} > /dev/null 2>&1; /bin/rm -f ${CACHE_FILE}.${local_host} > /dev/null 2>&1; echo \$?" | tr -d '[:space:]')
+ if [ "X${stat}" == "X0" ]; then
+ [ $SLDD_DEBUG -eq 1 ] &&
+ echo "Updating $host"
+ logger -i "SLDD: updating $host with ${CACHE_FILE}"
+ $RCP ${CACHE_FILE}.upd ${host}:${CACHE_FILE}.${local_host}
+ /bin/cp ${CACHE_FILE}.upd ${CACHE_FILE}.${host}
+ else
+ [ $SLDD_DEBUG -eq 1 ] &&
+ echo "$RSH to $host failed."
+ logger -i "SLDD: Failed to update $host with ${CACHE_FILE}. $RSH without password should be enabled"
+ exit 5
+ fi
+ else
+ [ $SLDD_DEBUG -eq 1 ] &&
+ echo "$host is down."
+ continue
+ fi
+ done
+}
+
+get_latest_remote_cache()
+{
+ # Find most updated remote cache file (the suffix should be like ip address: *.*.*.*)
+ echo -n "$(/bin/ls -1t ${CACHE_FILE}.*.* 2> /dev/null | head -1)"
+}
+
+get_largest_remote_cache()
+{
+ # Find largest (size) remote cache file (the suffix should be like ip address: *.*.*.*)
+ echo -n "$(/bin/ls -1S ${CACHE_FILE}.*.* 2> /dev/null | head -1)"
+}
+
+swap_cache_files()
+{
+ /bin/rm -f ${CACHE_FILE}.old
+ /bin/mv ${CACHE_FILE} ${CACHE_FILE}.old
+ /bin/cp ${largest_remote_cache} ${CACHE_FILE}
+ touch ${CACHE_FILE}.tmp
+}
+
+# Find local host in the osm hosts list
+local_host=""
+for host in ${OSM_HOSTS}
+do
+ if is_local $host; then
+ local_host=${host}
+ fi
+done
+
+# Get cache file info
+declare -i new_size=0
+declare -i last_size=0
+declare -i largest_remote_cache_size=0
+
+if [ -e ${CACHE_FILE} ]; then
+ last_size=$(du -b ${CACHE_FILE} | awk '{print$1}' | tr -d '[:space:]')
+else
+ touch ${CACHE_FILE} ${CACHE_FILE}.tmp
+fi
+
+# if [ ${last_size} -gt 0 ]; then
+# # First time update
+# update_remote_cache
+# fi
+
+while true
+do
+ if [ -s "${CACHE_FILE}" ]; then
+ new_size=$(du -b ${CACHE_FILE} | awk '{print$1}' | tr -d '[:space:]')
+ # Check if local cache file grew from its last version or the time stamp changed
+ if [ ${new_size} -gt ${last_size} ] ||
+ [ "$(/bin/ls -1t ${CACHE_FILE} ${CACHE_FILE}.tmp 2> /dev/null | head -1)" != "${CACHE_FILE}.tmp" ]; then
+ largest_remote_cache=$(get_largest_remote_cache)
+ if [[ -n "${largest_remote_cache}" && -s "${largest_remote_cache}" ]]; then
+ largest_remote_cache_size=$(du -b ${largest_remote_cache} 2> /dev/null | awk '{print$1}' | tr -d '[:space:]')
+ else
+ largest_remote_cache_size=0
+ fi
+
+ # Check if local cache file larger than remote chache file
+ if [ ${new_size} -gt ${largest_remote_cache_size} ]; then
+ [ $SLDD_DEBUG -eq 1 ] &&
+ echo "Local cache file larger then remote. Update remote cache files"
+ last_size=${new_size}
+ update_remote_cache
+ continue
+ fi
+ fi
+
+ largest_remote_cache=$(get_largest_remote_cache)
+ if [[ -n "${largest_remote_cache}" && -s "${largest_remote_cache}" ]]; then
+ largest_remote_cache_size=$(du -b ${largest_remote_cache} 2> /dev/null | awk '{print$1}' | tr -d '[:space:]')
+ else
+ largest_remote_cache_size=0
+ fi
+
+ # Update local cache file from remote
+ if [ ${largest_remote_cache_size} -gt ${new_size} ]; then
+ [ $SLDD_DEBUG -eq 1 ] &&
+ echo "Local cache file shorter then remote. Use ${largest_remote_cache}"
+ logger -i "SLDD: updating local cache file with ${largest_remote_cache}"
+ swap_cache_files
+ last_size=${largest_remote_cache_size}
+ fi
+
+ else # The local cache file is empty
+ [ $SLDD_DEBUG -eq 1 ] &&
+ echo "${CACHE_FILE} is empty"
+
+ largest_remote_cache=$(get_largest_remote_cache)
+ if [[ -n "${largest_remote_cache}" && -s "${largest_remote_cache}" ]]; then
+ # Copy it to the current cache
+ [ $SLDD_DEBUG -eq 1 ] &&
+ echo "Local cache file is empty. Use ${largest_remote_cache}"
+ logger -i "SLDD: updating local cache file with ${largest_remote_cache}"
+ swap_cache_files
+ fi
+
+ fi
+
+ [ $SLDD_DEBUG -eq 1 ] &&
+ echo "Sleeping ${RESCAN_TIME} seconds."
+ sleep ${RESCAN_TIME}
+
+done