From 58f1cc011dc505b7402bf001077d523da23d2852 Mon Sep 17 00:00:00 2001
From: Peter Hedenskog
Date: Mon, 13 Aug 2012 16:38:57 +0200
Subject: [PATCH] now outputs which links that are crawled

---
 sitespeed.io | 69 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 49 insertions(+), 20 deletions(-)

diff --git a/sitespeed.io b/sitespeed.io
index 3b81cadf3..ec6f4b98d 100755
--- a/sitespeed.io
+++ b/sitespeed.io
@@ -50,22 +50,12 @@ NOW=$(date +"%Y-%m-%d-%H-%M-%S")
 DATE=$(date)
 
 echo "Will crawl from start point $URL with depth $DEPTH ... this can take a while"
-# remove the protocol
+
+# remove the protocol
 NOPROTOCOL=${URL#*//}
 HOST=${NOPROTOCOL%%/*}
 
-RETRIES=1
-index=0
-links=$(wget -r -l $DEPTH -nd -t $RETRIES -e robots=off --no-check-certificate --follow-tags=a --spider $USER $PASSWORD $URL 2>&1 | while read line
-do
-  echo "$line" | grep -E "\-\-\d{4}" | cut -d " " -f 4
-done)
-
-result=($(printf '%s\n' "${links[@]}"|sort|uniq))
-
-echo "Fetched ${#result[@]} pages"
-
-# Setup dirs
+# Setup dirs
 REPORT_DIR="sitespeed-result/sitespeed-$HOST-$NOW"
 REPORT_DATA_DIR="$REPORT_DIR/data"
 REPORT_DATA_PAGES_DIR="$REPORT_DATA_DIR/pages"
@@ -73,6 +63,45 @@ mkdir -p $REPORT_DIR
 mkdir $REPORT_DATA_DIR
 mkdir $REPORT_DATA_PAGES_DIR
 
+RETRIES=1
+index=0
+
+## Make sure we fetch pages that really exist
+isPageVerified=false
+
+echo "Will start fetching all a links ..."
+wget -r -l $DEPTH -nd -t $RETRIES -e robots=off --no-check-certificate --follow-tags=a --spider $USER $PASSWORD $URL 2>&1 | while read line
+do
+
+  ## Depends on the output message of the wget, not so clean
+  if [[ "$line" == *Spider* ]]
+  then
+    isPageVerified=true
+  fi
+
+  ## Only take care if urls that exist
+  if $isPageVerified
+  then
+    if [[ "$line" == *http* ]]
+    then
+      echo "$line" | grep -E "\-\-\d{4}" | cut -d " " -f 4
+      echo "$line" | grep -E "\-\-\d{4}" | cut -d " " -f 4 >> $REPORT_DATA_DIR/urls.txt
+      isPageVerified=false
+    fi
+  fi
+
+done
+
+## Remove duplicates
+cat $REPORT_DATA_DIR/urls.txt | sort -u > $REPORT_DATA_DIR/urls-uniq.txt
+mv $REPORT_DATA_DIR/urls-uniq.txt $REPORT_DATA_DIR/urls.txt
+
+result=( )
+while read txt ; do
+  result[${#result[@]}]=$txt
+done < $REPORT_DATA_DIR/urls.txt
+
+echo "Fetched ${#result[@]} pages"
 
 echo '' >> $REPORT_DATA_DIR/result.xml
 
@@ -91,13 +120,6 @@ do
 done
 echo ''>> "$REPORT_DATA_DIR/result.xml"
 
-echo 'Create the pages.html'
-java -jar dependencies/xml-velocity-1.0-full.jar $REPORT_DATA_DIR/result.xml report/velocity/pages.vm report/properties/pages.properties $REPORT_DIR/pages.html
-
-echo 'Create the summary: index.html'
-java -jar dependencies/xml-velocity-1.0-full.jar $REPORT_DATA_DIR/result.xml report/velocity/summary.vm report/properties/summary.properties $REPORT_DIR/index.html
-
-
 echo 'Create individual pages'
 for file in $REPORT_DATA_PAGES_DIR/*
 do
@@ -105,6 +127,13 @@ do
   java -jar dependencies/xml-velocity-1.0-full.jar $file report/velocity/page.vm report/properties/page.properties $REPORT_DIR/$filename.html
 done
 
+echo 'Create the pages.html'
+java -jar dependencies/xml-velocity-1.0-full.jar $REPORT_DATA_DIR/result.xml report/velocity/pages.vm report/properties/pages.properties $REPORT_DIR/pages.html
+
+echo 'Create the summary: index.html'
+java -jar dependencies/xml-velocity-1.0-full.jar $REPORT_DATA_DIR/result.xml report/velocity/summary.vm report/properties/summary.properties $REPORT_DIR/index.html
+
+
 #copy the rest of the files
 mkdir $REPORT_DIR/css
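
Note on the crawl step added in the second hunk: it drives wget in spider mode over the site, keeps the "--YYYY-MM-DD hh:mm:ss--  <url>" request lines that wget prints, cuts out the fourth space-separated field (the URL), appends it to urls.txt, de-duplicates the file with sort -u, and finally reads the list back into a bash array. The snippet below is a condensed standalone sketch of that pipeline, not part of the patch: START_URL and OUT are placeholder names, the per-line "Spider" existence gate is dropped for brevity, and [0-9] stands in for \d since GNU grep -E has no \d class.

#!/bin/bash
# Sketch of the link-collection step the patch adds; names below are
# placeholders, not taken from sitespeed.io itself.
START_URL="http://example.com/"   # assumed start page
DEPTH=1                           # how deep to follow <a> links
OUT=urls.txt                      # assumed output file

# Spider the site; wget prints a "--<timestamp>--  <url>" line for every
# page it requests, and the URL is the fourth space-separated field.
wget -r -l $DEPTH -nd -t 1 -e robots=off --no-check-certificate \
     --follow-tags=a --spider "$START_URL" 2>&1 \
  | grep -E "^--[0-9]{4}" \
  | cut -d " " -f 4 \
  | sort -u > "$OUT"

echo "Fetched $(wc -l < "$OUT") pages"

Piping the whole stream through sort -u stands in for the urls.txt/urls-uniq.txt two-step in the patch; the end result is the same list of unique URLs.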