# Copyright 1999-2014 Gentoo Foundation
# Distributed under the terms of the GNU General Public License v2
# $Id$

# @ECLASS: multiprocessing.eclass
# @MAINTAINER:
# base-system@gentoo.org
# @AUTHOR:
# Brian Harring
# Mike Frysinger
# @BLURB: parallelization with bash (wtf?)
# @DESCRIPTION:
# The multiprocessing eclass contains a suite of functions that allow ebuilds
# to quickly run things in parallel using shell code.
#
# It has two modes: pre-fork and post-fork. If you don't want to dive into any
# more nuts & bolts, just use the pre-fork mode. For main threads that mostly
# spawn children and then wait for them to finish, use the pre-fork mode. For
# main threads that do a bit of processing themselves, use the post-fork mode.
# You may mix & match them for longer computation loops.
# @EXAMPLE:
#
# @CODE
# # First initialize things:
# multijob_init
#
# # Then hash a bunch of files in parallel:
# for n in {0..20} ; do
# 	multijob_child_init md5sum data.${n} > data.${n}.md5
# done
#
# # Then wait for all the children to finish:
# multijob_finish
# @CODE

if [[ -z ${_MULTIPROCESSING_ECLASS} ]]; then
_MULTIPROCESSING_ECLASS=1

# @FUNCTION: bashpid
# @DESCRIPTION:
# Return the process id of the current sub shell. This is to support bash
# versions older than 4.0 that lack $BASHPID support natively. Simply do:
# echo ${BASHPID:-$(bashpid)}
#
# Note: Using this func in any other way than the one above is not supported.
bashpid() {
	# Running bashpid plainly will return incorrect results. This func must
	# be run in a subshell of the current subshell to get the right pid.
	# i.e. This will show the wrong value:
	#   bashpid
	# But this will show the right value:
	#   (bashpid)
	sh -c 'echo ${PPID}'
}

# @FUNCTION: get_nproc
# @USAGE: [${fallback:-1}]
# @DESCRIPTION:
# Attempt to figure out the number of processing units available.
# If the value cannot be determined, prints the provided fallback
# instead. If no fallback is provided, defaults to 1.
get_nproc() {
	local nproc

	# GNU
	if type -P nproc &>/dev/null; then
		nproc=$(nproc)
	fi

	# BSD
	if [[ -z ${nproc} ]] && type -P sysctl &>/dev/null; then
		nproc=$(sysctl -n hw.ncpu 2>/dev/null)
	fi

	# fallback to python2.6+
	# note: this may fail (raise NotImplementedError)
	if [[ -z ${nproc} ]] && type -P python &>/dev/null; then
		nproc=$(python -c 'import multiprocessing; print(multiprocessing.cpu_count());' 2>/dev/null)
	fi

	if [[ -n ${nproc} ]]; then
		echo "${nproc}"
	else
		echo "${1:-1}"
	fi
}
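
# Illustrative sketch (not upstream documentation): an ebuild that needs the
# CPU count directly, e.g. to size its own worker pool, might do something
# like the following; the fallback of "1" is only used when detection fails.
#
#	local procs=$(get_nproc)
#	einfo "Using ${procs} processing unit(s)"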

# @FUNCTION: makeopts_jobs
# @USAGE: [${MAKEOPTS}] [${inf:-999}]
# @DESCRIPTION:
# Searches the arguments (defaults to ${MAKEOPTS}) and extracts the jobs number
# specified therein. Useful for running non-make tools in parallel too.
# i.e. if the user has MAKEOPTS=-j9, this will echo "9" -- we can't return the
# number as bash normalizes it to [0, 255]. If the flags haven't specified a
# -j flag, then "1" is shown as that is the default `make` uses. Since there's
# no way to represent infinity, we return ${inf} (defaults to 999) if the user
# has -j without a number.
makeopts_jobs() {
	[[ $# -eq 0 ]] && set -- ${MAKEOPTS}
	# This assumes the first .* will be more greedy than the second .*
	# since POSIX doesn't specify a non-greedy match (i.e. ".*?").
	local jobs=$(echo " $* " | sed -r -n \
		-e 's:.*[[:space:]](-[a-z]*j|--jobs[=[:space:]])[[:space:]]*([0-9]+).*:\2:p' \
		-e "s:.*[[:space:]](-[a-z]*j|--jobs)[[:space:]].*:${2:-999}:p")
	echo ${jobs:-1}
}

# @FUNCTION: makeopts_loadavg
# @USAGE: [${MAKEOPTS}] [${inf:-999}]
# @DESCRIPTION:
# Searches the arguments (defaults to ${MAKEOPTS}) and extracts the value set
# for load-average. For make- and ninja-based builds this means new jobs are
# not only limited by the jobs value, but also by the current load, which can
# climb due to I/O and not just CPU load.
# Be aware that the returned number might be a floating-point number. Test
# whether your software supports that.
# If no limit is specified, or --load-average is used without a number, ${inf}
# (defaults to 999) is returned.
makeopts_loadavg() {
	[[ $# -eq 0 ]] && set -- ${MAKEOPTS}
	# This assumes the first .* will be more greedy than the second .*
	# since POSIX doesn't specify a non-greedy match (i.e. ".*?").
	local lavg=$(echo " $* " | sed -r -n \
		-e 's:.*[[:space:]](-[a-z]*l|--(load-average|max-load)[=[:space:]])[[:space:]]*([0-9]+|[0-9]+\.[0-9]+).*:\3:p' \
		-e "s:.*[[:space:]](-[a-z]*l|--(load-average|max-load))[[:space:]].*:${2:-999}:p")
	# Default to ${inf} since the default is to not use a load limit.
	echo ${lavg:-${2:-999}}
}
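
# Illustrative sketch (not upstream documentation): both helpers are handy for
# driving non-make tools that understand -j/-l limits, e.g. ninja:
#
#	ninja -j "$(makeopts_jobs)" -l "$(makeopts_loadavg)"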

# @FUNCTION: multijob_init
# @USAGE: [${MAKEOPTS}]
# @DESCRIPTION:
# Set up the environment for executing code in parallel.
# You must call this before any other multijob function.
multijob_init() {
	# When something goes wrong, try to wait for all the children so we
	# don't leave any zombies around.
	has wait ${EBUILD_DEATH_HOOKS} || EBUILD_DEATH_HOOKS+=" wait "

	# Set up a pipe for children to write their pids to when they finish.
	# We have to allocate two fd's because POSIX has undefined behavior
	# when you open a FIFO for simultaneous read/write. #487056
	local pipe="${T}/multijob.pipe"
	mkfifo -m 600 "${pipe}"
	redirect_alloc_fd mj_write_fd "${pipe}"
	redirect_alloc_fd mj_read_fd "${pipe}"
	rm -f "${pipe}"

	# See how many children we can fork based on the user's settings.
	mj_max_jobs=$(makeopts_jobs "$@")
	mj_num_jobs=0
}

# @FUNCTION: multijob_child_init
# @USAGE: [--pre|--post] [command to run in background]
# @DESCRIPTION:
# This function has two forms. You can use it to execute a simple command
# in the background (and it takes care of everything else), or you must
# call this first thing in your forked child process.
#
# The --pre/--post options allow you to select the child generation mode.
#
# @CODE
# # 1st form: pass the command line as arguments:
# multijob_child_init ls /dev
# # Or if you want to use pre/post fork modes:
# multijob_child_init --pre ls /dev
# multijob_child_init --post ls /dev
#
# # 2nd form: run a compound block in the background (post fork):
# (
# multijob_child_init
# out=`ls`
# if echo "${out}" | grep foo ; then
# 	echo "YEAH"
# fi
# ) &
# multijob_post_fork
#
# # 2nd form: run a compound block in the background (pre fork):
# multijob_pre_fork
# (
# multijob_child_init
# out=`ls`
# if echo "${out}" | grep foo ; then
# 	echo "YEAH"
# fi
# ) &
# @CODE
multijob_child_init() {
	local mode="pre"
	case $1 in
	--pre)  mode="pre" ; shift ;;
	--post) mode="post"; shift ;;
	esac

	if [[ $# -eq 0 ]] ; then
		trap 'echo ${BASHPID:-$(bashpid)} $? >&'${mj_write_fd} EXIT
		trap 'exit 1' INT TERM
	else
		local ret
		[[ ${mode} == "pre" ]] && { multijob_pre_fork; ret=$?; }
		( multijob_child_init ; "$@" ) &
		[[ ${mode} == "post" ]] && { multijob_post_fork; ret=$?; }
		return ${ret}
	fi
}

# @FUNCTION: _multijob_fork
# @INTERNAL
# @DESCRIPTION:
# Do the actual bookkeeping.
_multijob_fork() {
	[[ $# -eq 1 ]] || die "incorrect number of arguments"

	local ret=0
	[[ $1 == "post" ]] && : $(( ++mj_num_jobs ))
	if [[ ${mj_num_jobs} -ge ${mj_max_jobs} ]] ; then
		multijob_finish_one
		ret=$?
	fi
	[[ $1 == "pre" ]] && : $(( ++mj_num_jobs ))
	return ${ret}
}

# @FUNCTION: multijob_pre_fork
# @DESCRIPTION:
# You must call this in the parent process before forking a child process.
# If the parallel limit has been hit, it will wait for one child to finish
# and return its exit status.
multijob_pre_fork() { _multijob_fork pre "$@" ; }

# @FUNCTION: multijob_post_fork
# @DESCRIPTION:
# You must call this in the parent process after forking a child process.
# If the parallel limit has been hit, it will wait for one child to finish
# and return its exit status.
multijob_post_fork() { _multijob_fork post "$@" ; }

# @FUNCTION: multijob_finish_one
# @DESCRIPTION:
# Wait for a single process to exit and return its exit code.
multijob_finish_one() {
	[[ $# -eq 0 ]] || die "${FUNCNAME} takes no arguments"

	local pid ret
	read -r -u ${mj_read_fd} pid ret || die
	: $(( --mj_num_jobs ))
	return ${ret}
}

# @FUNCTION: multijob_finish
# @DESCRIPTION:
# Wait for all pending processes to exit and return the bitwise or
# of all their exit codes.
multijob_finish() {
	local ret=0
	while [[ ${mj_num_jobs} -gt 0 ]] ; do
		multijob_finish_one
		: $(( ret |= $? ))
	done
	# Let bash clean up its internal child tracking state.
	wait

	# Do this after reaping all the children.
	[[ $# -eq 0 ]] || die "${FUNCNAME} takes no arguments"

	# No need to hook anymore.
	EBUILD_DEATH_HOOKS=${EBUILD_DEATH_HOOKS/ wait / }

	return ${ret}
}

# @FUNCTION: redirect_alloc_fd
# @USAGE: <var> <file> [redirection]
# @DESCRIPTION:
# Find a free fd and redirect the specified file via it. Store the new
# fd in the specified variable. Useful for the cases where we don't care
# about the exact fd #.
redirect_alloc_fd() {
	local var=$1 file=$2 redir=${3:-"<>"}

	# Make sure /dev/fd is sane on Linux hosts. #479656
	if [[ ! -L /dev/fd && ${CBUILD} == *linux* ]] ; then
		eerror "You're missing a /dev/fd symlink to /proc/self/fd."
		eerror "Please fix the symlink and check your boot scripts (udev/etc...)."
		die "/dev/fd is broken"
	fi

	if [[ $(( (BASH_VERSINFO[0] << 8) + BASH_VERSINFO[1] )) -ge $(( (4 << 8) + 1 )) ]] ; then
		# Newer bash provides this functionality.
		eval "exec {${var}}${redir}'${file}'"
	else
		# Need to provide the functionality ourselves.
		local fd=10
		while :; do
			# Make sure the fd isn't open. It could be a char device,
			# or a symlink (possibly broken) to something else.
			if [[ ! -e /dev/fd/${fd} ]] && [[ ! -L /dev/fd/${fd} ]] ; then
				eval "exec ${fd}${redir}'${file}'" && break
			fi
			[[ ${fd} -gt 1024 ]] && die 'could not locate a free temp fd !?'
			: $(( ++fd ))
		done
		: $(( ${var} = fd ))
	fi
}

fi
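
# Illustrative sketch (an assumption about typical usage, not part of the
# documented API above): redirect_alloc_fd picks the fd number at runtime and
# stores it in the named variable, so a log file could be wired up like this
# ("log_fd" is just a hypothetical name):
#
#	redirect_alloc_fd log_fd "${T}/build.log" ">"
#	echo "build started" >&${log_fd}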