When crawling, only record URLs that return HTTP 200 and have an HTML content type
This commit is contained in:
parent
81395fcaad
commit
a0109ed66e
34
sitespeed.io
34
sitespeed.io
|
|
@ -1,5 +1,4 @@
|
|||
#! /bin/bash
|
||||
|
||||
#******************************************************
|
||||
# Sitespeed.io - How speedy is your site?
|
||||
#
|
||||
|
|
@ -58,25 +57,40 @@ HOST=${NOPROTOCOL%%/*}
|
|||
# Setup dirs
|
||||
# Per-run output layout, keyed by target host and run timestamp:
#   $REPORT_DIR/            - top level report for this run
#   $REPORT_DIR/data/       - raw collected data (urls.txt, result.xml)
#   $REPORT_DIR/pages/      - rendered per-page HTML reports
#   $REPORT_DIR/data/pages/ - per-page yslow XML results
REPORT_DIR="sitespeed-result/sitespeed-$HOST-$NOW"
REPORT_DATA_DIR="$REPORT_DIR/data"
REPORT_PAGES_DIR="$REPORT_DIR/pages"
REPORT_DATA_PAGES_DIR="$REPORT_DATA_DIR/pages"

# One mkdir -p creates every intermediate directory; quoting protects
# against unexpected characters in $HOST. Abort early if we cannot
# create the output tree - nothing later can succeed without it.
mkdir -p "$REPORT_DATA_PAGES_DIR" "$REPORT_PAGES_DIR" || exit 1

# Number of wget retries per URL during the spider run.
RETRIES=1
index=0

# Per-URL state while parsing wget's spider output:
#   isVerified - wget confirmed the remote file exists (HTTP 200)
#   isHTML     - the reported content-type line mentioned html
isVerified=false
isHTML=false
|
||||
|
||||
echo "Will start fetching all a links ..."

# Spider the site and collect only URLs that (a) exist - HTTP 200 - and
# (b) have an html content type. wget --spider verifies each URL without
# downloading it and prints one status line per check; we parse that
# stream line by line.
#
# $USER and $PASSWORD are intentionally unquoted: they hold optional
# extra wget flags (e.g. --http-user=...) and must word-split, staying
# empty-invisible when unset.
#
# NOTE: the while loop runs in a pipeline subshell, so the flag
# variables below do not survive past "done" - results are persisted
# to urls.txt, not to shell state.
wget -r -l "$DEPTH" -nd -t "$RETRIES" -e robots=off --no-check-certificate --follow-tags=a --spider $USER $PASSWORD "$URL" 2>&1 | while IFS= read -r line
do
    # --spider reports "Remote file exists" only for URLs that resolve.
    if [[ $line == "Remote file exists"* ]]
    then
        isVerified=true
    # ...and keep only content types that mention html.
    elif [[ $line == Length* && $line == *html* ]]
    then
        isHTML=true
    elif [[ $line == Length* ]]
    then
        isHTML=false
    # Request lines look like "--2012-10-01 10:00:00--  http://...";
    # field 4 is the URL. Spider mode hits each URL twice, so only
    # record it once both checks above have passed, then reset the
    # flags for the next URL.
    elif [[ $line == --* ]] && $isVerified && $isHTML
    then
        echo "$line" | cut -d " " -f 4
        echo "$line" | cut -d " " -f 4 >> "$REPORT_DATA_DIR/urls.txt"
        isVerified=false
        isHTML=false
    fi
done
|
||||
|
||||
|
|
@ -99,10 +113,12 @@ do
|
|||
echo "Analyzing $i"
|
||||
phantomjs dependencies/yslow.js -f xml "$i" >>"$REPORT_DATA_PAGES_DIR/$pagefilename.xml"
|
||||
# Sometimes the yslow script adds output before the xml tag, should probably be reported ...
|
||||
sed -ibak '/<?xml/,$!d' $REPORT_DATA_PAGES_DIR/$pagefilename.xml || exit 1
|
||||
|
||||
sed '/<?xml/,$!d' $REPORT_DATA_PAGES_DIR/$pagefilename.xml >> $REPORT_DATA_PAGES_DIR/bup || exit 1
|
||||
mv $REPORT_DATA_PAGES_DIR/bup $REPORT_DATA_PAGES_DIR/$pagefilename.xml
|
||||
|
||||
# Hack for adding link to the output file name
|
||||
sed -ibak 's/<results>/<results filename="'$pagefilename'">/g' $REPORT_DATA_PAGES_DIR/$pagefilename.xml || exit 1
|
||||
sed 's/<results>/<results filename="'$pagefilename'">/g' $REPORT_DATA_PAGES_DIR/$pagefilename.xml >> $REPORT_DATA_PAGES_DIR/bup || exit 1
|
||||
mv $REPORT_DATA_PAGES_DIR/bup $REPORT_DATA_PAGES_DIR/$pagefilename.xml
|
||||
sed 's/<?xml version="1.0" encoding="UTF-8"?>//g' "$REPORT_DATA_PAGES_DIR/$pagefilename.xml" >> "$REPORT_DATA_DIR/result.xml" || exit 1
|
||||
pagefilename=$[$pagefilename+1]
|
||||
done
|
||||
|
|
@ -112,7 +128,7 @@ echo 'Create individual pages'
|
|||
# Render each per-page yslow XML result into an HTML report page.
# Quoted glob base keeps the loop safe if the report path ever
# contains spaces; the glob itself stays unquoted so it expands.
for file in "$REPORT_DATA_PAGES_DIR"/*
do
    # Strip the directory and the .xml extension to get the page id.
    filename=$(basename "$file" .xml)
    # Pages are written under $REPORT_PAGES_DIR (not $REPORT_DIR) so
    # rendered pages stay separate from the top-level summary report.
    java -jar dependencies/xml-velocity-1.0-full.jar "$file" report/velocity/page.vm report/properties/page.properties "$REPORT_PAGES_DIR/$filename.html" || exit 1
done
|
||||
|
||||
echo 'Create the pages.html'
|
||||
|
|
|
|||
Loading…
Reference in New Issue