docs/script/check-external-links

135 строки
3.9 KiB
Plaintext
Исходник Обычный вид История

#!/usr/bin/env bash
# [start-readme]
# The script is run once per day via a scheduled GitHub Action to check all links in the site. It automatically opens an issue if it finds broken links.
# To exclude a URL from the link check, add it to `lib/excluded-links.js`.
#
# For checking internal links, see `script/check-internal-links`.
# [end-readme]
internal=""
while getopts "h?i" opt; do
case "${opt}" in
h|\?) echo "Usage:"
echo " script/check-external-links [OPTIONS] [two-letter language code]"
echo ""
echo " script/check-external-links -i Check internal links. Without this flag, check all links."
echo " script/check-external-links -h Display this help message."
exit 0
;;
i) internal=" --internalOnly"
;;
esac
done
shift $((OPTIND -1))
if [ -z "${1}" ]
then
echo "error: must provide two-letter language code"
exit 1
fi
languageCode=${1}
# Pass options to script to construct blc command
blcCommand="$(./script/get-blc-command.js ${internal} --language ${languageCode})"
# Exit if script returned an error
if test $? -eq 1
then
exit 1
fi
# Determine logfile name based on options
logfile=""
if [ -z "${internal}" ]
then
logfile="blc_output.log"
else
logfile="blc_output_internal.log"
fi
# Kill any server running in the background, then start the server
killall node >/dev/null 2>&1
node server.js >/dev/null &
sleep 5
host="http://localhost:4000"
# Check whether localhost is accessible
hostStatus=$(curl -I --silent "${host}" | head -1)
isHostOK=$(echo "${hostStatus}" | grep "[2|3][0-9][0-9]")
if [ -z "${isHostOK}" ]
then
echo "Can't connect to ${host}!"
echo ${hostStatus}
echo ${isHostOK}
exit 1
fi
# Execute blc and save output
${blcCommand[@]} > ${logfile}
# We're done with the server now, so end the process
# killall node will also terminate this script, so find and kill the specific pid
pid=$(ps aux | grep "node server.js" | grep -v "grep" | awk '{ print $2 }'); kill -INT $pid >/dev/null 2>&1
# Recheck "403 Forbidden" results due to a bug
# https://github.com/stevenvachon/broken-link-checker/issues/58
# Also recheck "429" GitHub results
urlsToRecheck=$(egrep "HTTP_4(03|29)" ${logfile} | grep -o "http.* ")
if [ ! -z "${urlsToRecheck}" ]
then
for url in ${urlsToRecheck}
do
# Curl each URL and grep for 4xx or 5xx in status code response
status=$(curl -I --silent "${url}" | head -1 | grep "[4|5][0-9][0-9]")
if [ -z "${status}" ]
then
# If no 4/5xx found, the link is NOT really broken, so remove it from the list
# This command needs to work in all implementations of sed (Mac/GNU/etc)
sed -i'.bak' -e "s|^.*$url.*$||" ${logfile}
# Remove backup file
find . -name "${logfile}.bak" | xargs rm
fi
done
fi
# Count number of broken links in output
# Ignore "308 Permanent Redirect" results, which are not actually broken
numberOfBrokenLinks=$(grep "BROKEN" ${logfile} | grep -vc HTTP_308)
brokenLinks=$(grep "BROKEN" ${logfile} | grep -v HTTP_308)
# If broken links are found, exit with status 1 so the check run fails
if [ ${numberOfBrokenLinks} -gt 0 ]
then
# Print "links" or "link" in message depending on the number found
if [ ${numberOfBrokenLinks} -gt 1 ]
then
linkOrLinks="links"
else
linkOrLinks="link"
fi
echo -e "\n${numberOfBrokenLinks} broken ${linkOrLinks} found on help.github.com\n"
echo -e "Note: links that start with 'http://localhost:4000/' are internal links.\n"
# List broken links
echo "${brokenLinks}"
# Update final number of broken links
echo -e "\n$(tail -2 ${logfile})" | sed "s|. [0-9]* broken.|. ${numberOfBrokenLinks} broken.|"
# Exit without failure when checking all links so script/open-broken-links-issue can run
if [ -z "${internal}" ]
then
exit 0
else
exit 1
fi
else
echo "All links are good!"
echo -e "\n$(tail -2 ${logfile})"
exit 0
fi