
Commit

feat(na): add manual link checker scripts
mrjones-plip committed Feb 7, 2025
1 parent 65711ed commit d531040
Showing 3 changed files with 108 additions and 0 deletions.
47 changes: 47 additions & 0 deletions .github/scripts/check.urls.sh
@@ -0,0 +1,47 @@
#!/bin/bash

# Function to get HTTP response code of a URL
get_response_code() {
    local url=$1
    local response_code=$(curl -s -o /dev/null -w "%{http_code}" "$url")
    echo "$response_code"
}

# Function to check for meta refresh tag in HTML content
check_meta_refresh() {
    local html_content=$1
    local url=$2
    if grep -q '<meta http-equiv="refresh"' <<< "$html_content"; then
        local redirect_url=$(grep -oP 'url=[^"]+' <<< "$html_content" | cut -d'=' -f2-)
        local redirect_response_code=$(get_response_code "$redirect_url")
        echo "${url} is redirected! Result is:"
        echo " -> $redirect_url $redirect_response_code"
    fi
}

run_checks(){
    # Loop through each URL in the file
    while IFS= read -r url; do
        # Get the HTTP response code; print any URL that does not return 200
        response_code=$(get_response_code "$url")
        if [ "$response_code" -ne 200 ]; then
            echo "$url $response_code"
        fi

        # If the response code is 200, check for a meta refresh tag
        if [ "$response_code" -eq 200 ]; then
            html_content=$(curl -s "$url")
            check_meta_refresh "$html_content" "$url"
        fi
    done < urls.txt
}

echo;echo "Are you on test branch running hugo on http://localhost:1313 and already run get.urls.sh?";echo
read -p "\"y\" to proceed, \"n\" to cancel " -n 1 -r
echo # (optional) move to a new line
if [[ ! $REPLY =~ ^[nN]$ ]]
then
run_checks
echo;echo "Done!";echo
fi
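
A minimal usage sketch for the pair of scripts (the URLs in the sample output are hypothetical; it assumes both scripts are run from the same directory so check.urls.sh finds the urls.txt written by get.urls.sh, and that Hugo is serving the site on http://localhost:1313):

# on the 'main' branch, with hugo serving the site:
./get.urls.sh
# switch to the test branch, restart hugo, then:
./check.urls.sh
# example output for a broken link and a redirected link:
# http://localhost:1313/apps/old-page/ 404
# http://localhost:1313/core/overview/ is redirected! Result is:
#  -> http://localhost:1313/core/overview-new/ 200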

60 changes: 60 additions & 0 deletions .github/scripts/get.urls.sh
@@ -0,0 +1,60 @@
#!/bin/bash

# Function to crawl URLs recursively
function crawl_urls {
    local base_url="$1"
    local path="$2"
    local url="$3"
    local visited_urls=("${@:4}")

    # Check if the URL has already been visited
    if [[ " ${visited_urls[@]} " =~ " $url " ]]; then
        return
    fi

    # Add the current URL to the visited list
    visited_urls+=("$url")

    # Fetch the HTML content of the URL and suppress all output
    html_content=$(wget -qO- "$url" 2>/dev/null)
    wget_exit_status=$?

    # Skip this URL if wget could not fetch it
    if [ $wget_exit_status -ne 0 ]; then
        return
    fi

    # Extract the href attribute of every anchor tag, skipping in-page fragment links
    local links=$(echo "$html_content" | grep -oE '<a [^>]+>' | grep -oE 'href="([^"#]+)"' | sed -e 's/^href="//' -e 's/"$//')

    # Output each URL found under the current URL
    for link in $links; do
        # Construct an absolute URL if the link is relative
        if [[ $link == /* ]]; then
            link="$base_url$link"
        fi

        # Follow the link only if it is under the specified path and has not been visited yet
        if [[ $link == "$base_url$path/"* && ! " ${visited_urls[@]} " =~ " $link " ]]; then
            echo "$link"
            # Recursively crawl the linked page
            crawl_urls "$base_url" "$path" "$link" "${visited_urls[@]}"
        fi
    done
}

echo;echo "Are you on the 'main' branch and running Hugo on http://localhost:1313?";echo
read -p "\"y\" to proceed, \"n\" to cancel " -n 1 -r
echo # move to a new line
if [[ ! $REPLY =~ ^[nN]$ ]]
then
    # Start crawling from the base URL with the specified path
    base_url="http://localhost:1313"
    path=""
    declare -a visited_urls=()
    crawl_urls "$base_url" "$path" "$base_url$path" "${visited_urls[@]}" | sort -u > urls.txt
    count=$(wc -l < urls.txt)
    echo "Saved $count URLs in urls.txt"
fi
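
A small sketch of what get.urls.sh leaves behind (the entries below are hypothetical; it assumes the Hugo dev server is reachable on http://localhost:1313 and the confirmation prompt was answered with "y"):

$ ./get.urls.sh
Saved 312 URLs in urls.txt
$ head -3 urls.txt
http://localhost:1313/apps/
http://localhost:1313/apps/concepts/
http://localhost:1313/building/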


1 change: 1 addition & 0 deletions .gitignore
@@ -9,3 +9,4 @@ tech-doc-hugo
.DS_Store
.idea
.hugo_build.lock
urls.txt
