diff --git a/dependencies/crawler-0.9-full.jar b/dependencies/crawler-0.9-full.jar
new file mode 100644
index 000000000..5fd146d51
Binary files /dev/null and b/dependencies/crawler-0.9-full.jar differ
diff --git a/dependencies/yslow-3.1.4-sitespeed.js b/dependencies/yslow-3.1.4-sitespeed.js
index 028ec120d..b71a22c07 100644
--- a/dependencies/yslow-3.1.4-sitespeed.js
+++ b/dependencies/yslow-3.1.4-sitespeed.js
@@ -5861,7 +5861,7 @@ YSLOW.registerRule({
         var css = doc.getElementsByTagName('link'),
             comps = cset.getComponentsByType('css'),
             comp, docdomain, src, offenders = {},
-            offender_comps = [],
+            offendercomponents = [],
             uniquedns = [],
             score = 100;
         docdomain = YSLOW.util.getHostname(cset.doc_comp.url);
@@ -5880,20 +5880,23 @@ YSLOW.registerRule({
         for (var i = 0; i < comps.length; i++) {
             if (offenders[comps[i].url]) {
                 if (docdomain !== YSLOW.util.getHostname(comps[i].url)) {
-                    offender_comps.push(comps[i]);
+                    offendercomponents.push(comps[i]);
                 }
             }
         }
 
-        var message = offender_comps.length === 0 ? '' :
-            'The following ' + YSLOW.util.plural('%num% css', offender_comps.length) +
-            ' are loaded from a different domain inside head, causing DNS lookups before page is rendered.';
-        score -= offender_comps.length * parseInt(config.points, 10)
+        uniquedns = YSLOW.util.getUniqueDomains(offendercomponents, true);
+
+        var message = offendercomponents.length === 0 ? '' :
+            'The following ' + YSLOW.util.plural('%num% css', offendercomponents.length) +
+            ' are loaded from a different domain inside head, causing DNS lookups before the page is rendered. Unique DNS lookups in head that decrease the score: ' + uniquedns.length + '.';
+        // Only punish unique DNS lookups, not every offending stylesheet
+        score -= uniquedns.length * parseInt(config.points, 10);
 
         return {
             score: score,
             message: message,
-            components: offender_comps
+            components: offendercomponents
         };
     }
 });
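A note on the scoring change above: the rule used to subtract config.points once per offending stylesheet, but it now subtracts once per unique domain, since the browser pays the DNS lookup cost only once per hostname. A minimal standalone sketch of that idea follows; it is not YSlow code: the function and parameter names are invented, the { url: ... } component shape is an assumption, and new URL() stands in for YSLOW.util.getHostname().

    // Sketch only: deduct points once per unique DNS lookup, not per stylesheet.
    function scoreCssDnsLookups(offenderComponents, pointsPerLookup) {
        var uniqueHosts = {};
        for (var i = 0; i < offenderComponents.length; i++) {
            // record each hostname a single time
            uniqueHosts[new URL(offenderComponents[i].url).hostname] = true;
        }
        // e.g. three stylesheets served from one CDN now cost one deduction
        return 100 - Object.keys(uniqueHosts).length * pointsPerLookup;
    }

    // Two unique hosts at 10 points each: score 80 instead of the old 70.
    scoreCssDnsLookups([
        { url: 'http://cdn.example.com/a.css' },
        { url: 'http://cdn.example.com/b.css' },
        { url: 'http://fonts.example.net/c.css' }
    ], 10);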
diff --git a/sitespeed.io b/sitespeed.io
index 6693c8d81..a6accd9a9 100755
--- a/sitespeed.io
+++ b/sitespeed.io
@@ -23,7 +23,7 @@ if (!command -v phantomjs &> /dev/null) ; then
 fi
 
 if [ -z "$1" ]; then
-  echo "Missing url. USAGE: ${0} http[s]://host[:port][/path/] [crawl-depth]"
+  echo "Missing url. USAGE: ${0} http[s]://host[:port][/path/] [crawl-depth] [follow-path]"
   exit 1;
 fi
 
@@ -37,6 +37,14 @@ else
 DEPTH="1"
 fi
 
+# Check if we should follow a specific path
+if [ "$3" != "" ]
+then
+  FOLLOW_PATH="-p $3"
+else
+  FOLLOW_PATH=""
+fi
+
 URL="$1"
 
 USER=""
@@ -44,7 +52,7 @@ PASSWORD=""
 
 NOW=$(date +"%Y-%m-%d-%H-%M-%S")
 DATE=$(date)
 
-echo "Will crawl from start point $URL with depth $DEPTH ... this can take a while"
+echo "Will crawl from start point $URL with depth $DEPTH $FOLLOW_PATH ... this can take a while"
 
 # remove the protocol
@@ -61,42 +69,9 @@ mkdir $REPORT_DATA_DIR
 mkdir $REPORT_PAGES_DIR
 mkdir $REPORT_DATA_PAGES_DIR
 
-RETRIES=1
-index=0
-isVerified=false
-isHTML=false
-#Faking firefox as useragent
-USERAGENT='Mozilla/5.0 (Windows NT 6.1; rv:15.0) Gecko/20120716 Firefox/15.0a2'
-
-echo "Will start fetching all a links ..."
-
-wget -r -l $DEPTH -nd -t $RETRIES -e robots=off --no-check-certificate --follow-tags=a --spider $URL 2>&1 | while read line
-do
-
-    # The spider option checks if a file exist, only fetch only existing
-    if [[ $line == "Remote file exists"* ]]
-    then
-      isVerified=true
-    # And only of content type html
-    elif [[ $line = Length* ]] && [[ $line = *html* ]]
-    then
-      isHTML=true
-    elif [[ $line = Length* ]]
-    then
-      isHTML=false
-    elif ([[ $line == --* ]] && $isVerified && $isHTML && [[ "$line" == *$HOST* ]])
-    then
-      echo "$line" | cut -d " " -f 4
-      echo "$line" | cut -d " " -f 4 >> $REPORT_DATA_DIR/urls.txt
-      isVerified=false
-      isHTML=false
-    fi
-done
-
-## Remove duplicates, always needing if we have same resources on multiple pages
-sed '/^$/d' $REPORT_DATA_DIR/urls.txt | sort -u > $REPORT_DATA_DIR/urls-uniq.txt
-mv $REPORT_DATA_DIR/urls-uniq.txt $REPORT_DATA_DIR/urls.txt
+java -Xmx256m -Xms256m -cp dependencies/crawler-0.9-full.jar com.soulgalore.crawler.run.CrawlToFile -u $URL -l $DEPTH $FOLLOW_PATH -f $REPORT_DATA_DIR/urls.txt -ef $REPORT_DATA_DIR/nonworkingurls.txt
 
+# read the urls
 result=()
 while read txt ; do
 	result[${#result[@]}]=$txt
@@ -104,6 +79,12 @@ done < $REPORT_DATA_DIR/urls.txt
 
 echo "Fetched ${#result[@]} pages"
 
+if [ ${#result[@]} -eq 0 ]
+then
+  exit 0
+fi
+
+
 echo '' > $REPORT_DATA_DIR/result.xml
 pagefilename=1
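A usage note, separate from the patch itself: the script now forwards an optional third argument to the Java crawler as -p, which, going by the comment in the patch, restricts the crawl to a specific path. A hypothetical invocation (host and path invented for illustration):

    ./sitespeed.io http://www.example.com 2 /docs/

Assuming -p filters links the way the comment suggests, this crawls two levels deep from www.example.com, follows only links matching /docs/, and writes the crawled URLs to urls.txt in the report data dir (broken URLs go to nonworkingurls.txt via the crawler's -ef flag).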