From d531040396a0ebc3297ea5ee7664fe6984411afd Mon Sep 17 00:00:00 2001
From: mrjones-plip
Date: Fri, 7 Feb 2025 12:29:57 -0800
Subject: [PATCH] feat(na): add manual link checker scripts

---
 .github/scripts/check.urls.sh | 47 +++++++++++++++++++++++++++
 .github/scripts/get.urls.sh   | 60 +++++++++++++++++++++++++++++++++++
 .gitignore                    |  1 +
 3 files changed, 108 insertions(+)
 create mode 100755 .github/scripts/check.urls.sh
 create mode 100755 .github/scripts/get.urls.sh

diff --git a/.github/scripts/check.urls.sh b/.github/scripts/check.urls.sh
new file mode 100755
index 000000000..89abba1de
--- /dev/null
+++ b/.github/scripts/check.urls.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Function to get HTTP response code of a URL
+get_response_code() {
+    local url=$1
+    local response_code=$(curl -s -o /dev/null -w "%{http_code}" "$url")
+    echo "$response_code"
+}
+
+# Function to check for meta refresh tag in HTML content
+check_meta_refresh() {
+    local html_content=$1
+    url=$2
+    if grep -q '<meta http-equiv="refresh"' <<< "$html_content"; then
+        echo "Meta refresh redirect found: $url"
+    fi
+}
+
+# Check every URL gathered by get.urls.sh and report failures
+while IFS= read -r url; do
+    response_code=$(get_response_code "$url")
+    if [ "$response_code" != "200" ]; then
+        echo "$response_code $url"
+    fi
+
+    html_content=$(curl -s "$url")
+    check_meta_refresh "$html_content" "$url"
+done < urls.txt
diff --git a/.github/scripts/get.urls.sh b/.github/scripts/get.urls.sh
new file mode 100755
--- /dev/null
+++ b/.github/scripts/get.urls.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# Function to recursively crawl pages under a path, printing every URL found
+crawl_urls() {
+    local base_url=$1
+    local path=$2
+    local url=${3:-$base_url$path/}
+
+    # Record the current URL so it is not crawled twice
+    visited_urls+=("$url")
+
+    # Fetch the HTML content of the URL; assign separately so $? reflects
+    # wget's exit status rather than the status of the 'local' builtin
+    local html_content
+    html_content=$(wget -qO- "$url" 2>/dev/null)
+    wget_exit_status=$?
+
+    # Check if wget command was successful
+    if [ $wget_exit_status -ne 0 ]; then
+        return
+    fi
+
+    # Extract all anchor tags and their href attributes
+    local links=$(echo "$html_content" | grep -oE '<a [^>]+>' | grep -oE 'href="([^"#]+)"' | sed -e 's/^href="//' -e 's/"$//')
+
+    # Output each URL found under the current URL
+    for link in $links; do
+        # Construct absolute URL if the link is relative
+        if [[ $link == /* ]]; then
+            link="$base_url$link"
+        fi
+
+        # Check if the URL is under the specified path and has not been visited before
+        if [[ $link == "$base_url$path/"* && ! " ${visited_urls[@]} " =~ " $link " ]]; then
+            echo "$link"
+            # Recursively crawl the URL
+            crawl_urls "$base_url" "$path" "$link" "${visited_urls[@]}"
+        fi
+    done
+}
+
+echo;echo "Are you on the 'main' branch and running Hugo on http://localhost:1313?";echo
+read -p "\"y\" to proceed, \"n\" to cancel " -n 1 -r
+echo # (optional) move to a new line
+if [[ ! $REPLY =~ ^[nN]$ ]]
+then
+    # Start crawling from the base URL with the specified path
+    base_url="http://localhost:1313"
+    path=""
+    declare -a visited_urls=()
+    crawl_urls "$base_url" "$path" "${visited_urls[@]}" | sort -u > urls.txt
+    count=$(wc -l < urls.txt)
+    echo "Saved $count URLs in urls.txt"
+fi
diff --git a/.gitignore b/.gitignore
index 36fdfda29..7e1030df6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,4 @@ tech-doc-hugo
 .DS_Store
 .idea
 .hugo_build.lock
+urls.txt
\ No newline at end of file
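
Not part of the patch itself: a minimal usage sketch for the two scripts,
assuming they are run from the repository root with a local Hugo server
already serving the docs on http://localhost:1313 (the exact invocation is
an assumption; the patch only adds the scripts):

    hugo server &                      # serve the local docs site
    ./.github/scripts/get.urls.sh      # crawl the site, save URLs to urls.txt
    ./.github/scripts/check.urls.sh    # re-check each saved URL for non-200
                                       # responses and meta-refresh redirects

Splitting the crawl (get.urls.sh) from the check (check.urls.sh) means the
slow recursive crawl only needs to run once per local build, while the
checker can be re-run cheaply against the saved urls.txt, which is why
urls.txt is added to .gitignore above.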