aboutsummaryrefslogtreecommitdiff
blob: 9604b848f7de6c20d7b571da94b6e99a46f99b23 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
#!/bin/bash
#
# Copyright (c) 2006 Mellanox Technologies. All rights reserved.
#
# This Software is licensed under one of the following licenses:
#
# 1) under the terms of the "Common Public License 1.0" a copy of which is
#    available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/cpl.php.
#
# 2) under the terms of the "The BSD License" a copy of which is
#    available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/bsd-license.php.
#
# 3) under the terms of the "GNU General Public License (GPL) Version 2" a
#    copy of which is available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/gpl-license.php.
#
# Licensee has the right to choose one of the above licenses.
#
# Redistributions of source code must retain the above copyright
# notice and one of the license notices.
#
# Redistributions in binary form must reproduce both the above copyright
# notice, one of the license notices in the documentation
# and/or other materials provided with the distribution.
#
#
#  $Id: sldd.sh 7779 2006-06-07 12:05:46Z vlad $
#

# OpenSM was found to have the following problem
# when handover is performed:
# If some of the cluster nodes are rebooted during the handover, they lose their LID assignment.
# The reason is that the standby SM does not obey its own GUID-to-LID table
# and simply uses the discovered LIDs. If some nodes are not available to it,
# their previous LID assignment is lost forever.

# The idea is to use an external daemon that will distribute
# the semi-static LID assignment table from the master SM to all standby SMs.
# A standby SM that becomes the master needs to obey the copied semi-static LID assignment table.

# config: /etc/opensm.conf

# Configuration file; it is sourced below, so every assignment in it becomes
# a variable of this script.
CONFIG=/etc/opensm.conf

# SLDD_DEBUG=1 turns on the progress messages printed with echo.
SLDD_DEBUG=${SLDD_DEBUG:-0}

# No configuration file: leave successfully without doing anything.
if [ ! -f "${CONFIG}" ]; then
	[ "${SLDD_DEBUG}" -eq 1 ] &&
	echo "${CONFIG} not found."
	exit 0
fi

. "${CONFIG}"

# Defaults for whatever neither the environment nor ${CONFIG} has set.
CACHE_FILE=${CACHE_FILE:-/var/cache/osm/guid2lid}
CACHE_DIR=$(dirname "${CACHE_FILE}")
tmp_cache=${CACHE_FILE}.tmp

PING='ping -w 1 -c 1'

RCP=${RCP:-/usr/bin/scp}
RSH=${RSH:-/usr/bin/ssh}
IFCONFIG=${IFCONFIG:-'/sbin/ifconfig -a'}

declare -i SLDD_DEBUG
RESCAN_TIME=${RESCAN_TIME:-60}

# OSM_HOSTS is a whitespace-separated list, so it is split on purpose.
# Counting the resulting words (instead of testing the raw string) also
# catches a value that consists of whitespace only.
declare -a arr_OSM_HOSTS
# shellcheck disable=SC2206
arr_OSM_HOSTS=(${OSM_HOSTS})

num_of_osm_hosts=${#arr_OSM_HOSTS[@]}

if [ "${num_of_osm_hosts}" -eq 0 ]; then
	[ "${SLDD_DEBUG}" -eq 1 ] &&
	echo "No OpenSM servers (OSM_HOSTS) configured for the IB subnet."
	exit 0
fi

# A single server has no peer to exchange the cache with.
if [ "${num_of_osm_hosts}" -eq 1 ]; then
	[ "${SLDD_DEBUG}" -eq 1 ] &&
	echo "One OpenSM server configured in the IB subnet." &&
	echo "Nothing to be done for SLDD"

	exit 0
fi

# Signal 15 is SIGTERM; trap_handler is looked up when the signal arrives,
# so it may be defined further down.
trap 'trap_handler' 15

# Handler installed with "trap" for signal 15: records the shutdown through
# logger and ends the script with exit status 0.
trap_handler()
{
	local farewell="SLDD: Exiting."

	logger -i "${farewell}"
	exit 0
}

# Probe a host with the ${PING} command line.
# Globals:   PING (read) - program plus options, e.g. 'ping -w 1 -c 1'
# Arguments: $1 - host name or address to probe
# Returns:   the exit status of the ${PING} command; all output is discarded
is_alive()
{
	# ${PING} holds a command with options and must be word-split;
	# the host is one argument and is quoted so that it is passed verbatim.
	${PING} "$1" > /dev/null 2>&1
}

# Check whether an address shows up in the output of ${IFCONFIG}.
# Globals:   IFCONFIG (read) - command line that lists the local interfaces
# Arguments: $1 - address to look for
# Returns:   0 when the address occurs as a whole word in the output,
#            non-zero otherwise (including an empty argument)
is_local()
{
	# An empty pattern would not identify any interface.
	[ -n "$1" ] || return 1

	# -F: compare literally, so the dots of an address are not regex
	#     wildcards that match unrelated numbers in the output.
	# -w: whole words only, so 10.0.0.1 does not match 10.0.0.10.
	${IFCONFIG} | grep -w -F -- "$1" > /dev/null 2>&1
}

# Push the local cache file to the other OpenSM hosts.
#
# A snapshot of ${CACHE_FILE} is taken as ${CACHE_FILE}.upd. For every host in
# OSM_HOSTS that is not ${local_host} and for which is_alive succeeds, the
# cache directory is created and an old ${CACHE_FILE}.${local_host} is removed
# on that host through ${RSH}; the snapshot is then copied there with ${RCP}.
# Only after a successful copy is the snapshot also stored locally as
# ${CACHE_FILE}.${host}, so a failed transfer is not recorded as done.
#
# Globals: CACHE_FILE, CACHE_DIR, OSM_HOSTS, local_host, RSH, RCP,
#          SLDD_DEBUG (all read)
# Returns: 1 when the snapshot cannot be created, 0 otherwise
# Exits:   the whole script with status 5 when the ${RSH} step fails for a
#          host that answered is_alive
update_remote_cache()
{
	local host remote_status remote_cmd
	local snapshot="${CACHE_FILE}.upd"

	/bin/rm -f "${snapshot}"
	if ! /bin/cp -a "${CACHE_FILE}" "${snapshot}"; then
		logger -i "SLDD: Failed to create ${snapshot} from ${CACHE_FILE}"
		return 1
	fi

	[ "${SLDD_DEBUG}" -eq 1 ] &&
	echo "Updating remote cache file"

	# The command is interpreted by the remote shell, so the paths are
	# shell-quoted; "echo $?" reports the status of the rm back to us.
	printf -v remote_cmd \
		'/bin/mkdir -p %q > /dev/null 2>&1; /bin/rm -f %q > /dev/null 2>&1; echo $?' \
		"${CACHE_DIR}" "${CACHE_FILE}.${local_host}"

	# OSM_HOSTS is a whitespace-separated list: split on purpose.
	for host in ${OSM_HOSTS}
	do
		# Skip local host update
		if [ "${host}" == "${local_host}" ]; then
			continue
		fi

		if ! is_alive "${host}"; then
			[ "${SLDD_DEBUG}" -eq 1 ] &&
			echo "$host is down."
			continue
		fi

		# RSH and RCP may carry options and are therefore left unquoted.
		remote_status=$(${RSH} "${host}" "${remote_cmd}" | tr -d '[:space:]')
		if [ "X${remote_status}" != "X0" ]; then
			[ "${SLDD_DEBUG}" -eq 1 ] &&
			echo "$RSH to $host failed."
			logger -i "SLDD: Failed to update $host with ${CACHE_FILE}. $RSH without password should be enabled"
			exit 5
		fi

		[ "${SLDD_DEBUG}" -eq 1 ] &&
		echo "Updating $host"
		logger -i "SLDD: updating $host with ${CACHE_FILE}"
		if ${RCP} "${snapshot}" "${host}:${CACHE_FILE}.${local_host}"; then
			# Local record of what this host now holds.
			/bin/cp "${snapshot}" "${CACHE_FILE}.${host}"
		else
			[ "${SLDD_DEBUG}" -eq 1 ] &&
			echo "$RCP to $host failed."
			logger -i "SLDD: Failed to copy ${CACHE_FILE} to $host with $RCP"
		fi
	done

	return 0
}

# Print the most recently modified remote cache copy, without a trailing
# newline. Candidates are the paths matching ${CACHE_FILE}.*.* - at least two
# dots after the cache file name, as in an IP address suffix - so single-suffix
# files such as ${CACHE_FILE}.tmp or ${CACHE_FILE}.old are not considered.
# Globals: CACHE_FILE (read)
# Outputs: the path on stdout, or nothing when no candidate exists
get_latest_remote_cache()
{
	# The variable part is quoted so that a path with whitespace stays one
	# word, while the glob itself is left unquoted so that it still expands.
	# -d lists a matching directory itself instead of its contents;
	# -t sorts newest first. ls complaints about "no match" are dropped.
	printf '%s' "$(/bin/ls -1dt -- "${CACHE_FILE}".*.* 2> /dev/null | head -n 1)"
}

# Print the largest (by size) remote cache copy, without a trailing newline.
# Candidates are the paths matching ${CACHE_FILE}.*.* - at least two dots
# after the cache file name, as in an IP address suffix - so single-suffix
# files such as ${CACHE_FILE}.tmp or ${CACHE_FILE}.upd are not considered.
# Globals: CACHE_FILE (read)
# Outputs: the path on stdout, or nothing when no candidate exists
get_largest_remote_cache()
{
	# The variable part is quoted so that a path with whitespace stays one
	# word, while the glob itself is left unquoted so that it still expands.
	# -d lists a matching directory itself instead of its contents;
	# -S sorts largest first. ls complaints about "no match" are dropped.
	printf '%s' "$(/bin/ls -1dS -- "${CACHE_FILE}".*.* 2> /dev/null | head -n 1)"
}

# Replace the local cache file with the copy named by ${largest_remote_cache}.
# The previous local cache is kept as ${CACHE_FILE}.old, and
# ${CACHE_FILE}.tmp is touched afterwards so that it is not older than the
# new cache file.
# Globals: CACHE_FILE, largest_remote_cache (read)
# Returns: 1 when the remote copy cannot be copied (the local cache is then
#          left as it was); otherwise the status of the final touch
swap_cache_files()
{
	local staged="${CACHE_FILE}.new"

	# Copy first, swap afterwards: if the source is missing or unreadable
	# the current cache must survive instead of being moved away for nothing.
	if [ -z "${largest_remote_cache}" ] ||
	   ! /bin/cp "${largest_remote_cache}" "${staged}"; then
		/bin/rm -f "${staged}"
		logger -i "SLDD: Failed to copy ${largest_remote_cache} to ${CACHE_FILE}"
		return 1
	fi

	/bin/rm -f "${CACHE_FILE}.old"
	if [ -e "${CACHE_FILE}" ]; then
		/bin/mv "${CACHE_FILE}" "${CACHE_FILE}.old"
	fi
	/bin/mv -f "${staged}" "${CACHE_FILE}"
	touch "${CACHE_FILE}.tmp"
}

# Find local host in the osm hosts list: the entry of OSM_HOSTS for which
# is_local succeeds (the last one, should several succeed). local_host stays
# empty when none does.
local_host=""
# OSM_HOSTS is a whitespace-separated list: split on purpose.
for host in ${OSM_HOSTS}
do
	if is_local "${host}"; then
		local_host=${host}
	fi
done

# Get cache file info
declare -i new_size=0
declare -i last_size=0
declare -i largest_remote_cache_size=0

if [ -e "${CACHE_FILE}" ]; then
	# Size in bytes of the cache file as found at start-up.
	last_size=$(du -b "${CACHE_FILE}" | awk '{print $1}')
else
	# Start with an empty cache and its time stamp marker. The directory is
	# created first (as is done on the remote hosts), otherwise touch fails
	# on a host that has never had a cache.
	/bin/mkdir -p "${CACHE_DIR}"
	touch "${CACHE_FILE}" "${CACHE_FILE}.tmp"
fi

# if [ ${last_size} -gt 0 ]; then
# 	# First time update
# 	update_remote_cache
# fi

# Main loop: once per ${RESCAN_TIME} seconds compare the local cache file with
# the largest remote copy and synchronise in whichever direction is needed.
#  - local cache changed and is larger than every remote copy: push it with
#    update_remote_cache;
#  - a remote copy is larger than the local cache, or the local cache is
#    empty: take that copy over with swap_cache_files.
# Every pass ends in the sleep, also after a push, so that peers which could
# not be updated are retried at the scan interval rather than continuously.
while true
do
	if [ -s "${CACHE_FILE}" ]; then
		new_size=$(du -b "${CACHE_FILE}" | awk '{print $1}')

		largest_remote_cache=$(get_largest_remote_cache)
		if [[ -n "${largest_remote_cache}" && -s "${largest_remote_cache}" ]]; then
			largest_remote_cache_size=$(du -b "${largest_remote_cache}" 2> /dev/null | awk '{print $1}')
		else
			largest_remote_cache_size=0
		fi

		# Check if local cache file grew from its last version or the time stamp
		# changed: the cache counts as changed unless ${CACHE_FILE}.tmp is the
		# newer of the two files.
		local_cache_changed=0
		if [ "${new_size}" -gt "${last_size}" ] ||
		   [ "$(/bin/ls -1t "${CACHE_FILE}" "${CACHE_FILE}.tmp" 2> /dev/null | head -n 1)" != "${CACHE_FILE}.tmp" ]; then
			local_cache_changed=1
		fi

		if [ "${local_cache_changed}" -eq 1 ] &&
		   [ "${new_size}" -gt "${largest_remote_cache_size}" ]; then
			# Local cache file larger than remote cache file
			if [ "${SLDD_DEBUG}" -eq 1 ]; then
				echo "Local cache file larger then remote. Update remote cache files"
			fi
			last_size=${new_size}
			update_remote_cache
		elif [ "${largest_remote_cache_size}" -gt "${new_size}" ]; then
			# Update local cache file from remote
			if [ "${SLDD_DEBUG}" -eq 1 ]; then
				echo "Local cache file shorter then remote. Use ${largest_remote_cache}"
			fi
			logger -i "SLDD: updating local cache file with ${largest_remote_cache}"
			swap_cache_files
			last_size=${largest_remote_cache_size}
		fi

	else # The local cache file is empty
		if [ "${SLDD_DEBUG}" -eq 1 ]; then
			echo "${CACHE_FILE} is empty"
		fi

		largest_remote_cache=$(get_largest_remote_cache)
		if [[ -n "${largest_remote_cache}" && -s "${largest_remote_cache}" ]]; then
			# Copy it to the current cache
			if [ "${SLDD_DEBUG}" -eq 1 ]; then
				echo "Local cache file is empty. Use ${largest_remote_cache}"
			fi
			logger -i "SLDD: updating local cache file with ${largest_remote_cache}"
			swap_cache_files
		fi

	fi

	if [ "${SLDD_DEBUG}" -eq 1 ]; then
		echo "Sleeping ${RESCAN_TIME} seconds."
	fi
	sleep "${RESCAN_TIME}"

done