when crawling, only fetch URLs that return 200 with content type html

Peter Hedenskog 2012-08-14 15:44:23 +02:00
parent 81395fcaad
commit a0109ed66e
1 changed file with 25 additions and 9 deletions


@@ -1,5 +1,4 @@
#! /bin/bash
#******************************************************
# Sitespeed.io - How speedy is your site?
#
@@ -58,25 +57,40 @@ HOST=${NOPROTOCOL%%/*}
# Setup dirs
REPORT_DIR="sitespeed-result/sitespeed-$HOST-$NOW"
REPORT_DATA_DIR="$REPORT_DIR/data"
REPORT_PAGES_DIR="$REPORT_DIR/pages"
REPORT_DATA_PAGES_DIR="$REPORT_DATA_DIR/pages"
mkdir -p $REPORT_DIR
mkdir $REPORT_DATA_DIR
mkdir $REPORT_PAGES_DIR
mkdir $REPORT_DATA_PAGES_DIR
RETRIES=1
index=0
isVerified=false
isHTML=false
echo "Will start fetching all a links ..."
wget -r -l $DEPTH -nd -t $RETRIES -e robots=off --no-check-certificate --follow-tags=a --spider $USER $PASSWORD $URL 2>&1 | while read line
do
if [[ $line == --* ]]
# The spider option checks that a file exists; fetch only existing files
if [[ $line == "Remote file exists"* ]]
then
isVerified=true
# And only if the content type is html
elif [[ $line = Length* ]] && [[ $line = *html* ]]
then
isHTML=true
elif [[ $line = Length* ]]
then
isHTML=false
elif ([[ $line == --* ]] && $isVerified && $isHTML)
then
## In spider mode we hit the same URL twice, so only use it once it has been verified
echo "$line" | cut -d " " -f 4
echo "$line" | cut -d " " -f 4 >> $REPORT_DATA_DIR/urls.txt
echo "$line" | cut -d " " -f 4 >> $REPORT_DATA_DIR/urls.txt
isVerified=false
isHTML=false
fi
done
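
The loop above leans on the shape of wget's spider output: a request line starts with the timestamp wrapped in double dashes, then two spaces, then the URL, so the URL lands in the fourth space-separated field. "Remote file exists" is printed for URLs that answer 200, and the "Length:" line carries the content type. A minimal sketch of the field extraction (the sample line is made up but follows wget's request-line format):

# split on single spaces the fields are: "--2012-08-14", "15:44:23--",
# "" (the double space yields an empty field), then the URL itself
line='--2012-08-14 15:44:23--  http://example.com/page.html'
echo "$line" | cut -d " " -f 4   # prints http://example.com/page.html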
@@ -99,10 +113,12 @@ do
echo "Analyzing $i"
phantomjs dependencies/yslow.js -f xml "$i" >>"$REPORT_DATA_PAGES_DIR/$pagefilename.xml"
# Sometimes the yslow script adds output before the xml tag; that should probably be reported ...
sed -ibak '/<?xml/,$!d' $REPORT_DATA_PAGES_DIR/$pagefilename.xml || exit 1
sed '/<?xml/,$!d' $REPORT_DATA_PAGES_DIR/$pagefilename.xml >> $REPORT_DATA_PAGES_DIR/bup || exit 1
mv $REPORT_DATA_PAGES_DIR/bup $REPORT_DATA_PAGES_DIR/$pagefilename.xml
# Hack for adding the file name to the results tag so the page can be linked
sed -ibak 's/<results>/<results filename="'$pagefilename'">/g' $REPORT_DATA_PAGES_DIR/$pagefilename.xml || exit 1
sed 's/<results>/<results filename="'$pagefilename'">/g' $REPORT_DATA_PAGES_DIR/$pagefilename.xml >> $REPORT_DATA_PAGES_DIR/bup || exit 1
mv $REPORT_DATA_PAGES_DIR/bup $REPORT_DATA_PAGES_DIR/$pagefilename.xml
sed 's/<?xml version="1.0" encoding="UTF-8"?>//g' "$REPORT_DATA_PAGES_DIR/$pagefilename.xml" >> "$REPORT_DATA_DIR/result.xml" || exit 1
pagefilename=$((pagefilename+1))
done
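
Two things happen in this hunk. The sed expression '/<?xml/,$!d' selects the range from the first line matching <?xml through the end of the file and deletes everything outside it, i.e. it drops whatever noise phantomjs/yslow printed before the XML declaration. The commit also swaps sed -ibak (in-place edit with a backup suffix) for writing to a temporary bup file and moving it back, presumably because -i behaves differently between GNU and BSD sed. A quick illustration of the range deletion (the input is made up):

# only the lines from the <?xml declaration onward survive
printf 'stray yslow output\n<?xml version="1.0" encoding="UTF-8"?>\n<results/>\n' | sed '/<?xml/,$!d'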
@@ -112,7 +128,7 @@ echo 'Create individual pages'
for file in $REPORT_DATA_PAGES_DIR/*
do
filename=$(basename $file .xml)
java -jar dependencies/xml-velocity-1.0-full.jar $file report/velocity/page.vm report/properties/page.properties $REPORT_DIR/$filename.html || exit 1
java -jar dependencies/xml-velocity-1.0-full.jar $file report/velocity/page.vm report/properties/page.properties $REPORT_PAGES_DIR/$filename.html || exit 1
done
echo 'Create the pages.html'
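
For reference, basename in the last hunk strips both the directory and the .xml suffix, and the changed java line writes the generated page into the pages directory instead of the report root. A small sketch with hypothetical paths:

# hypothetical paths mirroring the loop above
REPORT_PAGES_DIR="sitespeed-result/sitespeed-example.com-2012-08-14/pages"
file="sitespeed-result/sitespeed-example.com-2012-08-14/data/pages/1.xml"
filename=$(basename "$file" .xml)        # directory and .xml suffix stripped -> "1"
echo "$REPORT_PAGES_DIR/$filename.html"  # where the HTML report now lands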