From 58f1cc011dc505b7402bf001077d523da23d2852 Mon Sep 17 00:00:00 2001
From: Peter Hedenskog
Date: Mon, 13 Aug 2012 16:38:57 +0200
Subject: [PATCH] now outputs which links that are crawled

---
 sitespeed.io | 69 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 49 insertions(+), 20 deletions(-)

diff --git a/sitespeed.io b/sitespeed.io
index 3b81cadf3..ec6f4b98d 100755
--- a/sitespeed.io
+++ b/sitespeed.io
@@ -50,22 +50,12 @@ NOW=$(date +"%Y-%m-%d-%H-%M-%S")
 DATE=$(date)
 
 echo "Will crawl from start point $URL with depth $DEPTH ... this can take a while"
-# remove the protocol
+
+# remove the protocol
 NOPROTOCOL=${URL#*//}
 HOST=${NOPROTOCOL%%/*}
 
-RETRIES=1
-index=0
-links=$(wget -r -l $DEPTH -nd -t $RETRIES -e robots=off --no-check-certificate --follow-tags=a --spider $USER $PASSWORD $URL 2>&1 | while read line
-do
-  echo "$line" | grep -E "\-\-\d{4}" | cut -d " " -f 4
-done)
-
-result=($(printf '%s\n' "${links[@]}"|sort|uniq))
-
-echo "Fetched ${#result[@]} pages"
-
-# Setup dirs
+# Setup dirs
 REPORT_DIR="sitespeed-result/sitespeed-$HOST-$NOW"
 REPORT_DATA_DIR="$REPORT_DIR/data"
 REPORT_DATA_PAGES_DIR="$REPORT_DATA_DIR/pages"
@@ -73,6 +63,45 @@ mkdir -p $REPORT_DIR
 mkdir $REPORT_DATA_DIR
 mkdir $REPORT_DATA_PAGES_DIR
 
+RETRIES=1
+index=0
+
+## Make sure we fetch pages that really exist
+isPageVerified=false
+
+echo "Will start fetching all a links ..."
+wget -r -l $DEPTH -nd -t $RETRIES -e robots=off --no-check-certificate --follow-tags=a --spider $USER $PASSWORD $URL 2>&1 | while read line
+do
+
+  ## Depends on the output message of the wget, not so clean
+  if [[ "$line" == *Spider* ]]
+  then
+    isPageVerified=true
+  fi
+
+  ## Only take care if urls that exist
+  if $isPageVerified
+  then
+    if [[ "$line" == *http* ]]
+    then
+      echo "$line" | grep -E "\-\-\d{4}" | cut -d " " -f 4
+      echo "$line" | grep -E "\-\-\d{4}" | cut -d " " -f 4 >> $REPORT_DATA_DIR/urls.txt
+      isPageVerified=false
+    fi
+  fi
+
+done
+
+## Remove duplicates
+cat $REPORT_DATA_DIR/urls.txt | sort -u > $REPORT_DATA_DIR/urls-uniq.txt
+mv $REPORT_DATA_DIR/urls-uniq.txt $REPORT_DATA_DIR/urls.txt
+
+result=( )
+while read txt ; do
+  result[${#result[@]}]=$txt
+done < $REPORT_DATA_DIR/urls.txt
+
+echo "Fetched ${#result[@]} pages"
 
 echo '' >> $REPORT_DATA_DIR/result.xml
 
@@ -91,13 +120,6 @@ do
 done
 echo ''>> "$REPORT_DATA_DIR/result.xml"
 
-echo 'Create the pages.html'
-java -jar dependencies/xml-velocity-1.0-full.jar $REPORT_DATA_DIR/result.xml report/velocity/pages.vm report/properties/pages.properties $REPORT_DIR/pages.html
-
-echo 'Create the summary: index.html'
-java -jar dependencies/xml-velocity-1.0-full.jar $REPORT_DATA_DIR/result.xml report/velocity/summary.vm report/properties/summary.properties $REPORT_DIR/index.html
-
-
 echo 'Create individual pages'
 for file in $REPORT_DATA_PAGES_DIR/*
 do
@@ -105,6 +127,13 @@ do
   java -jar dependencies/xml-velocity-1.0-full.jar $file report/velocity/page.vm report/properties/page.properties $REPORT_DIR/$filename.html
 done
 
+echo 'Create the pages.html'
+java -jar dependencies/xml-velocity-1.0-full.jar $REPORT_DATA_DIR/result.xml report/velocity/pages.vm report/properties/pages.properties $REPORT_DIR/pages.html
+
+echo 'Create the summary: index.html'
+java -jar dependencies/xml-velocity-1.0-full.jar $REPORT_DATA_DIR/result.xml report/velocity/summary.vm report/properties/summary.properties $REPORT_DIR/index.html
+
+
 #copy the rest of the files
 mkdir $REPORT_DIR/css
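
Note on the crawl step added in the second hunk: it drives wget in spider mode over the site, keeps the "--YYYY-MM-DD hh:mm:ss--  <url>" request lines that wget prints, cuts out the fourth space-separated field (the URL), appends it to urls.txt, de-duplicates the file with sort -u, and finally reads the list back into a bash array. The snippet below is a condensed standalone sketch of that pipeline, not part of the patch: START_URL and OUT are placeholder names, the per-line "Spider" existence gate is dropped for brevity, and [0-9] stands in for \d since GNU grep -E has no \d class.

#!/bin/bash
# Sketch of the link-collection step the patch adds; names below are
# placeholders, not taken from sitespeed.io itself.
START_URL="http://example.com/"   # assumed start page
DEPTH=1                           # how deep to follow <a> links
OUT=urls.txt                      # assumed output file

# Spider the site; wget prints a "--<timestamp>--  <url>" line for every
# page it requests, and the URL is the fourth space-separated field.
wget -r -l $DEPTH -nd -t 1 -e robots=off --no-check-certificate \
     --follow-tags=a --spider "$START_URL" 2>&1 \
  | grep -E "^--[0-9]{4}" \
  | cut -d " " -f 4 \
  | sort -u > "$OUT"

echo "Fetched $(wc -l < "$OUT") pages"

Piping the whole stream through sort -u stands in for the urls.txt/urls-uniq.txt two-step in the patch; the end result is the same list of unique URLs.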