Merge branch 'crawler'

Peter Hedenskog 2012-10-15 11:13:55 +02:00
commit 2e560f1caf
3 changed files with 28 additions and 44 deletions

BIN  dependencies/crawler-0.9-full.jar (vendored, Normal file)

Binary file not shown.


@@ -5861,7 +5861,7 @@ YSLOW.registerRule({
         var css = doc.getElementsByTagName('link'),
             comps = cset.getComponentsByType('css'),
             comp, docdomain, src, offenders = {},
-            offender_comps = [],
+            offendercomponents = [], uniquedns = [],
             score = 100;
         docdomain = YSLOW.util.getHostname(cset.doc_comp.url);
@@ -5880,20 +5880,23 @@ YSLOW.registerRule({
         for (var i = 0; i < comps.length; i++) {
             if (offenders[comps[i].url]) {
                 if (docdomain !== YSLOW.util.getHostname(comps[i].url)) {
-                    offender_comps.push(comps[i]);
+                    offendercomponents.push(comps[i]);
                 }
             }
         }
-        var message = offender_comps.length === 0 ? '' :
-            'The following ' + YSLOW.util.plural('%num% css', offender_comps.length) +
-            ' are loaded from a different domain inside head, causing DNS lookups before page is rendered.';
-        score -= offender_comps.length * parseInt(config.points, 10)
+        uniquedns = YSLOW.util.getUniqueDomains(offendercomponents, true);
+        var message = offendercomponents.length === 0 ? '' :
+            'The following ' + YSLOW.util.plural('%num% css', offendercomponents.length) +
+            ' are loaded from a different domain inside head, causing DNS lookups before page is rendered. Unique DNS in head that decreases the score:' + uniquedns.length + ".";
+        // only punish dns lookups
+        score -= uniquedns.length * parseInt(config.points, 10)
         return {
             score: score,
             message: message,
-            components: offender_comps
+            components: offendercomponents
         };
     }
 });
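The rule change above stops deducting points once per cross-domain stylesheet and instead deducts once per unique DNS lookup caused by CSS in head. A minimal standalone sketch of that scoring idea, assuming simplified stand-ins for YSLOW.util.getHostname and the rule's config.points (this is not the YSLOW source, only an illustration):

// Sketch: punish one DNS lookup per unique third-party domain,
// not one per stylesheet loaded from it.
function getHostname(url) {
    // simplified stand-in for YSLOW.util.getHostname
    return url.replace(/^https?:\/\//, '').split('/')[0];
}

function scoreCssDnsLookups(docUrl, cssUrlsInHead, pointsPerLookup) {
    var docDomain = getHostname(docUrl),
        seen = {},
        uniqueDns = [],
        i, host;
    for (i = 0; i < cssUrlsInHead.length; i++) {
        host = getHostname(cssUrlsInHead[i]);
        if (host !== docDomain && !seen[host]) {
            seen[host] = true;
            uniqueDns.push(host);
        }
    }
    return 100 - uniqueDns.length * pointsPerLookup;
}

// Two stylesheets from the same CDN now cost a single lookup (score 90
// with 10 points per lookup), where the old rule deducted once per file.
scoreCssDnsLookups('http://www.example.com/',
    ['http://cdn.example.net/a.css', 'http://cdn.example.net/b.css'], 10);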


@@ -23,7 +23,7 @@ if (!command -v phantomjs &> /dev/null) ; then
 fi
 if [ -z "$1" ]; then
-    echo "Missing url. USAGE: ${0} http[s]://host[:port][/path/] [crawl-depth]"
+    echo "Missing url. USAGE: ${0} http[s]://host[:port][/path/] [crawl-depth] [follow-path]"
     exit 1;
 fi
@@ -37,6 +37,14 @@ else
     DEPTH="1"
 fi
+# Check if we should follow a specific path
+if [ "$3" != "" ]
+then
+    FOLLOW_PATH="-p $3"
+else
+    FOLLOW_PATH=""
+fi
 URL="$1"
 USER=""
@@ -44,7 +52,7 @@ PASSWORD=""
 NOW=$(date +"%Y-%m-%d-%H-%M-%S")
 DATE=$(date)
-echo "Will crawl from start point $URL with depth $DEPTH ... this can take a while"
+echo "Will crawl from start point $URL with depth $DEPTH $FOLLOW_PATH ... this can take a while"
 # remove the protocol
@@ -61,42 +69,9 @@ mkdir $REPORT_DATA_DIR
 mkdir $REPORT_PAGES_DIR
 mkdir $REPORT_DATA_PAGES_DIR
-RETRIES=1
-index=0
-isVerified=false
-isHTML=false
-#Faking firefox as useragent
-USERAGENT='Mozilla/5.0 (Windows NT 6.1; rv:15.0) Gecko/20120716 Firefox/15.0a2'
-echo "Will start fetching all a links ..."
-wget -r -l $DEPTH -nd -t $RETRIES -e robots=off --no-check-certificate --follow-tags=a --spider $URL 2>&1 | while read line
-do
-    # The spider option checks if a file exist, only fetch only existing
-    if [[ $line == "Remote file exists"* ]]
-    then
-        isVerified=true
-    # And only of content type html
-    elif [[ $line = Length* ]] && [[ $line = *html* ]]
-    then
-        isHTML=true
-    elif [[ $line = Length* ]]
-    then
-        isHTML=false
-    elif ([[ $line == --* ]] && $isVerified && $isHTML && [[ "$line" == *$HOST* ]])
-    then
-        echo "$line" | cut -d " " -f 4
-        echo "$line" | cut -d " " -f 4 >> $REPORT_DATA_DIR/urls.txt
-        isVerified=false
-        isHTML=false
-    fi
-done
-## Remove duplicates, always needing if we have same resources on multiple pages
-sed '/^$/d' $REPORT_DATA_DIR/urls.txt | sort -u > $REPORT_DATA_DIR/urls-uniq.txt
-mv $REPORT_DATA_DIR/urls-uniq.txt $REPORT_DATA_DIR/urls.txt
+java -Xmx256m -Xms256m -cp dependencies/crawler-0.9-full.jar com.soulgalore.crawler.run.CrawlToFile -u $URL -l $DEPTH $FOLLOW_PATH -f $REPORT_DATA_DIR/urls.txt -ef $REPORT_DATA_DIR/nonworkingurls.txt
 # read the urls
 result=()
 while read txt ; do
     result[${#result[@]}]=$txt
@@ -104,6 +79,12 @@ done < $REPORT_DATA_DIR/urls.txt
 echo "Fetched ${#result[@]} pages"
+if [ ${#result[@]} == 0 ]
+then
+    exit 0
+fi
 echo '<?xml version="1.0" encoding="UTF-8"?><document host="'$HOST'" url="'$URL'" date="'$DATE'">' > $REPORT_DATA_DIR/result.xml
 pagefilename=1
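The script change above replaces the wget spider with the bundled crawler jar and adds an optional third argument that restricts the crawl to a path. A hypothetical invocation under those assumptions; the script name is assumed here, the argument order comes from the usage string, and the crawler flags (-u start URL, -l depth, -p path filter, -f/-ef output files) are the ones used in the diff:

# Crawl two levels deep from the start page, only following links under /docs/
# (script name assumed for illustration)
./sitespeed.io http://www.example.com 2 /docs/

# Roughly the same crawl, run directly against the bundled jar
java -Xmx256m -Xms256m -cp dependencies/crawler-0.9-full.jar \
    com.soulgalore.crawler.run.CrawlToFile \
    -u http://www.example.com -l 2 -p /docs/ \
    -f urls.txt -ef nonworkingurls.txt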