now outputs which links are crawled

This commit is contained in:
Peter Hedenskog 2012-08-13 16:38:57 +02:00
parent 171d8fa5ef
commit 58f1cc011d
1 changed file with 49 additions and 20 deletions


@@ -50,22 +50,12 @@ NOW=$(date +"%Y-%m-%d-%H-%M-%S")
DATE=$(date)
echo "Will crawl from start point $URL with depth $DEPTH ... this can take a while"
# remove the protocol
NOPROTOCOL=${URL#*//}
HOST=${NOPROTOCOL%%/*}
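## Bash parameter expansion: ${URL#*//} drops everything up to and including "//",
## and ${NOPROTOCOL%%/*} drops the first "/" and everything after it,
## e.g. http://example.com/start -> example.com (illustrative URL)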
RETRIES=1
index=0
links=$(wget -r -l $DEPTH -nd -t $RETRIES -e robots=off --no-check-certificate --follow-tags=a --spider $USER $PASSWORD $URL 2>&1 | while read line
do
echo "$line" | grep -E "\-\-\d{4}" | cut -d " " -f 4
done)
result=($(printf '%s\n' "${links[@]}"|sort|uniq))
echo "Fetched ${#result[@]} pages"
# Setup dirs
REPORT_DIR="sitespeed-result/sitespeed-$HOST-$NOW"
REPORT_DATA_DIR="$REPORT_DIR/data"
REPORT_DATA_PAGES_DIR="$REPORT_DATA_DIR/pages"
@@ -73,6 +63,45 @@ mkdir -p $REPORT_DIR
mkdir $REPORT_DATA_DIR
mkdir $REPORT_DATA_PAGES_DIR
RETRIES=1
index=0
## Make sure we fetch pages that really exist
isPageVerified=false
echo "Will start fetching all a links ..."
wget -r -l $DEPTH -nd -t $RETRIES -e robots=off --no-check-certificate --follow-tags=a --spider $USER $PASSWORD $URL 2>&1 | while read line
do
## Depends on the exact wording of wget's output, which is not so clean
if [[ "$line" == *Spider* ]]
then
isPageVerified=true
fi
## Only take care of URLs that exist
if $isPageVerified
then
if [[ "$line" == *http* ]]
then
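## A verified URL line from wget typically looks like
## "--2012-08-13 16:38:57--  http://host/page" (format assumed from wget's log),
## so grep matches the leading timestamp and cut picks out the URL field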
echo "$line" | grep -E "\-\-\d{4}" | cut -d " " -f 4
echo "$line" | grep -E "\-\-\d{4}" | cut -d " " -f 4 >> $REPORT_DATA_DIR/urls.txt
isPageVerified=false
fi
fi
done
## Remove duplicates
sort -u $REPORT_DATA_DIR/urls.txt > $REPORT_DATA_DIR/urls-uniq.txt
mv $REPORT_DATA_DIR/urls-uniq.txt $REPORT_DATA_DIR/urls.txt
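## A recursive crawl can report the same URL several times when it is
## linked from more than one page, hence the deduplication above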
result=( )
while read -r txt ; do
result+=("$txt")
done < $REPORT_DATA_DIR/urls.txt
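## On bash 4+, "mapfile -t result < $REPORT_DATA_DIR/urls.txt" would read the
## file into the array in one line; the loop above also works on older bash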
echo "Fetched ${#result[@]} pages"
echo '<?xml version="1.0" encoding="UTF-8"?><document host="'$HOST'" url="'$URL'" date="'$DATE'">' >> $REPORT_DATA_DIR/result.xml
@@ -91,13 +120,6 @@ do
done
echo '</document>'>> "$REPORT_DATA_DIR/result.xml"
echo 'Create the pages.html'
java -jar dependencies/xml-velocity-1.0-full.jar $REPORT_DATA_DIR/result.xml report/velocity/pages.vm report/properties/pages.properties $REPORT_DIR/pages.html
echo 'Create the summary: index.html'
java -jar dependencies/xml-velocity-1.0-full.jar $REPORT_DATA_DIR/result.xml report/velocity/summary.vm report/properties/summary.properties $REPORT_DIR/index.html
echo 'Create individual pages'
for file in $REPORT_DATA_PAGES_DIR/*
do
@@ -105,6 +127,13 @@ do
java -jar dependencies/xml-velocity-1.0-full.jar $file report/velocity/page.vm report/properties/page.properties $REPORT_DIR/$filename.html
done
echo 'Create the pages.html'
java -jar dependencies/xml-velocity-1.0-full.jar $REPORT_DATA_DIR/result.xml report/velocity/pages.vm report/properties/pages.properties $REPORT_DIR/pages.html
echo 'Create the summary: index.html'
java -jar dependencies/xml-velocity-1.0-full.jar $REPORT_DATA_DIR/result.xml report/velocity/summary.vm report/properties/summary.properties $REPORT_DIR/index.html
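# The xml-velocity jar is invoked the same way each time: input XML,
# Velocity template, properties file, and the HTML file to produce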
# Copy the rest of the files
mkdir $REPORT_DIR/css