From d531040396a0ebc3297ea5ee7664fe6984411afd Mon Sep 17 00:00:00 2001
From: mrjones-plip
Date: Fri, 7 Feb 2025 12:29:57 -0800
Subject: [PATCH] feat(na): add manual link checker scripts

---
 .github/scripts/check.urls.sh | 47 +++++++++++++++++++++++++++
 .github/scripts/get.urls.sh   | 60 +++++++++++++++++++++++++++++++++++
 .gitignore                    |  1 +
 3 files changed, 108 insertions(+)
 create mode 100755 .github/scripts/check.urls.sh
 create mode 100755 .github/scripts/get.urls.sh

diff --git a/.github/scripts/check.urls.sh b/.github/scripts/check.urls.sh
new file mode 100755
index 000000000..89abba1de
--- /dev/null
+++ b/.github/scripts/check.urls.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Function to get HTTP response code of a URL
+get_response_code() {
+    local url=$1
+    local response_code=$(curl -s -o /dev/null -w "%{http_code}" "$url")
+    echo "$response_code"
+}
+
+# Function to check for meta refresh tag in HTML content
+check_meta_refresh() {
+    local html_content=$1
+    url=$2
+    if grep -q '<meta http-equiv="refresh"' <<< "$html_content"; then
+        echo "Meta refresh redirect found: $url"
+    fi
+}
+
+# Check every URL gathered by get.urls.sh and report failures
+while IFS= read -r url; do
+    response_code=$(get_response_code "$url")
+    if [ "$response_code" != "200" ]; then
+        echo "$response_code $url"
+    fi
+
+    html_content=$(curl -s "$url")
+    check_meta_refresh "$html_content" "$url"
+done < urls.txt
diff --git a/.github/scripts/get.urls.sh b/.github/scripts/get.urls.sh
new file mode 100755
--- /dev/null
+++ b/.github/scripts/get.urls.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# Function to recursively crawl pages under a path, printing every URL found
+crawl_urls() {
+    local base_url=$1
+    local path=$2
+    local url=${3:-$base_url$path/}
+
+    # Record the current URL so it is not crawled twice
+    visited_urls+=("$url")
+
+    # Fetch the HTML content of the URL; assign separately so $? reflects
+    # wget's exit status rather than the status of the 'local' builtin
+    local html_content
+    html_content=$(wget -qO- "$url" 2>/dev/null)
+    wget_exit_status=$?
+
+    # Check if wget command was successful
+    if [ $wget_exit_status -ne 0 ]; then
+        return
+    fi
+
+    # Extract all anchor tags and their href attributes
+    local links=$(echo "$html_content" | grep -oE '<a [^>]+>' | grep -oE 'href="([^"#]+)"' | sed -e 's/^href="//' -e 's/"$//')
+
+    # Output each URL found under the current URL
+    for link in $links; do
+        # Construct absolute URL if the link is relative
+        if [[ $link == /* ]]; then
+            link="$base_url$link"
+        fi
+
+        # Check if the URL is under the specified path and has not been visited before
+        if [[ $link == "$base_url$path/"* && ! " ${visited_urls[@]} " =~ " $link " ]]; then
+            echo "$link"
+            # Recursively crawl the URL
+            crawl_urls "$base_url" "$path" "$link" "${visited_urls[@]}"
+        fi
+    done
+}
+
+echo;echo "Are you on the 'main' branch and running Hugo on http://localhost:1313?";echo
+read -p "\"y\" to proceed, \"n\" to cancel " -n 1 -r
+echo # (optional) move to a new line
+if [[ ! $REPLY =~ ^[nN]$ ]]
+then
+    # Start crawling from the base URL with the specified path
+    base_url="http://localhost:1313"
+    path=""
+    declare -a visited_urls=()
+    crawl_urls "$base_url" "$path" "${visited_urls[@]}" | sort -u > urls.txt
+    count=$(wc -l < urls.txt)
+    echo "Saved $count URLs in urls.txt"
+fi
diff --git a/.gitignore b/.gitignore
index 36fdfda29..7e1030df6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,4 @@ tech-doc-hugo
 .DS_Store
 .idea
 .hugo_build.lock
+urls.txt
\ No newline at end of file
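
Not part of the patch itself: a minimal usage sketch for the two scripts,
assuming they are run from the repository root with a local Hugo server
already serving the docs on http://localhost:1313 (the exact invocation is
an assumption; the patch only adds the scripts):

    hugo server &                      # serve the local docs site
    ./.github/scripts/get.urls.sh      # crawl the site, save URLs to urls.txt
    ./.github/scripts/check.urls.sh    # re-check each saved URL for non-200
                                       # responses and meta-refresh redirects

Splitting the crawl (get.urls.sh) from the check (check.urls.sh) means the
slow recursive crawl only needs to run once per local build, while the
checker can be re-run cheaply against the saved urls.txt, which is why
urls.txt is added to .gitignore above.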