when crawling, only fetch URLs that return 200 with content type html

Peter Hedenskog 2012-08-14 15:44:23 +02:00
parent 81395fcaad
commit a0109ed66e
1 changed file with 25 additions and 9 deletions


@@ -1,5 +1,4 @@
#! /bin/bash
#******************************************************
# Sitespeed.io - How speedy is your site?
#
@@ -58,25 +57,40 @@ HOST=${NOPROTOCOL%%/*}
# Setup dirs
REPORT_DIR="sitespeed-result/sitespeed-$HOST-$NOW"
REPORT_DATA_DIR="$REPORT_DIR/data"
REPORT_PAGES_DIR="$REPORT_DIR/pages"
REPORT_DATA_PAGES_DIR="$REPORT_DATA_DIR/pages"
mkdir -p $REPORT_DIR
mkdir $REPORT_DATA_DIR
mkdir $REPORT_PAGES_DIR
mkdir $REPORT_DATA_PAGES_DIR
RETRIES=1
index=0
isVerified=false
isHTML=false
echo "Will start fetching all a links ..."
wget -r -l $DEPTH -nd -t $RETRIES -e robots=off --no-check-certificate --follow-tags=a --spider $USER $PASSWORD $URL 2>&1 | while read line
do
if [[ $line == --* ]]
# The spider option checks that a file exists; fetch only existing files
if [[ $line == "Remote file exists"* ]]
then
isVerified=true
# And only if the content type is html
elif [[ $line = Length* ]] && [[ $line = *html* ]]
then
isHTML=true
elif [[ $line = Length* ]]
then
isHTML=false
elif ([[ $line == --* ]] && $isVerified && $isHTML)
then
## In spider mode we hit the same URL twice, so only use it once it has been verified
echo "$line" | cut -d " " -f 4
echo "$line" | cut -d " " -f 4 >> $REPORT_DATA_DIR/urls.txt
echo "$line" | cut -d " " -f 4 >> $REPORT_DATA_DIR/urls.txt
isVerified=false
isHTML=false
fi
done
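
The loop above leans on the shape of wget's spider output: a request line starts with the timestamp wrapped in double dashes, then two spaces, then the URL, so the URL lands in the fourth space-separated field. "Remote file exists" is printed for URLs that answer 200, and the "Length:" line carries the content type. A minimal sketch of the field extraction (the sample line is made up but follows wget's request-line format):

# split on single spaces the fields are: "--2012-08-14", "15:44:23--",
# "" (the double space yields an empty field), then the URL itself
line='--2012-08-14 15:44:23--  http://example.com/page.html'
echo "$line" | cut -d " " -f 4   # prints http://example.com/page.html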
@@ -99,10 +113,12 @@ do
echo "Analyzing $i"
phantomjs dependencies/yslow.js -f xml "$i" >>"$REPORT_DATA_PAGES_DIR/$pagefilename.xml"
# Sometimes the yslow script adds output before the xml tag; that should probably be reported ...
sed -ibak '/<?xml/,$!d' $REPORT_DATA_PAGES_DIR/$pagefilename.xml || exit 1
sed '/<?xml/,$!d' $REPORT_DATA_PAGES_DIR/$pagefilename.xml >> $REPORT_DATA_PAGES_DIR/bup || exit 1
mv $REPORT_DATA_PAGES_DIR/bup $REPORT_DATA_PAGES_DIR/$pagefilename.xml
# Hack for adding the file name to the results tag so the page can be linked
sed -ibak 's/<results>/<results filename="'$pagefilename'">/g' $REPORT_DATA_PAGES_DIR/$pagefilename.xml || exit 1
sed 's/<results>/<results filename="'$pagefilename'">/g' $REPORT_DATA_PAGES_DIR/$pagefilename.xml >> $REPORT_DATA_PAGES_DIR/bup || exit 1
mv $REPORT_DATA_PAGES_DIR/bup $REPORT_DATA_PAGES_DIR/$pagefilename.xml
sed 's/<?xml version="1.0" encoding="UTF-8"?>//g' "$REPORT_DATA_PAGES_DIR/$pagefilename.xml" >> "$REPORT_DATA_DIR/result.xml" || exit 1
pagefilename=$((pagefilename+1))
done
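
Two things happen in this hunk. The sed expression '/<?xml/,$!d' selects the range from the first line matching <?xml through the end of the file and deletes everything outside it, i.e. it drops whatever noise phantomjs/yslow printed before the XML declaration. The commit also swaps sed -ibak (in-place edit with a backup suffix) for writing to a temporary bup file and moving it back, presumably because -i behaves differently between GNU and BSD sed. A quick illustration of the range deletion (the input is made up):

# only the lines from the <?xml declaration onward survive
printf 'stray yslow output\n<?xml version="1.0" encoding="UTF-8"?>\n<results/>\n' | sed '/<?xml/,$!d'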
@@ -112,7 +128,7 @@ echo 'Create individual pages'
for file in $REPORT_DATA_PAGES_DIR/*
do
filename=$(basename $file .xml)
java -jar dependencies/xml-velocity-1.0-full.jar $file report/velocity/page.vm report/properties/page.properties $REPORT_DIR/$filename.html || exit 1
java -jar dependencies/xml-velocity-1.0-full.jar $file report/velocity/page.vm report/properties/page.properties $REPORT_PAGES_DIR/$filename.html || exit 1
done
echo 'Create the pages.html'
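
For reference, basename in the last hunk strips both the directory and the .xml suffix, and the changed java line writes the generated page into the pages directory instead of the report root. A small sketch with hypothetical paths:

# hypothetical paths mirroring the loop above
REPORT_PAGES_DIR="sitespeed-result/sitespeed-example.com-2012-08-14/pages"
file="sitespeed-result/sitespeed-example.com-2012-08-14/data/pages/1.xml"
filename=$(basename "$file" .xml)        # directory and .xml suffix stripped -> "1"
echo "$REPORT_PAGES_DIR/$filename.html"  # where the HTML report now lands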