Merge branch 'crawler'
commit 2e560f1caf
Binary file not shown.
@@ -5861,7 +5861,7 @@ YSLOW.registerRule({
         var css = doc.getElementsByTagName('link'),
             comps = cset.getComponentsByType('css'),
             comp, docdomain, src, offenders = {},
-            offender_comps = [],
+            offendercomponents = [], uniquedns = [],
             score = 100;
 
         docdomain = YSLOW.util.getHostname(cset.doc_comp.url);
 
@@ -5880,20 +5880,23 @@ YSLOW.registerRule({
         for (var i = 0; i < comps.length; i++) {
             if (offenders[comps[i].url]) {
                 if (docdomain !== YSLOW.util.getHostname(comps[i].url)) {
-                    offender_comps.push(comps[i]);
+                    offendercomponents.push(comps[i]);
                 }
             }
         }
 
-        var message = offender_comps.length === 0 ? '' :
-            'The following ' + YSLOW.util.plural('%num% css', offender_comps.length) +
-            ' are loaded from a different domain inside head, causing DNS lookups before page is rendered.';
-        score -= offender_comps.length * parseInt(config.points, 10)
+        uniquedns = YSLOW.util.getUniqueDomains(offendercomponents, true);
+
+        var message = offendercomponents.length === 0 ? '' :
+            'The following ' + YSLOW.util.plural('%num% css', offendercomponents.length) +
+            ' are loaded from a different domain inside head, causing DNS lookups before page is rendered. Unique DNS in head that decreases the score:' + uniquedns.length + ".";
+        // only punish dns lookups
+        score -= uniquedns.length * parseInt(config.points, 10)
 
         return {
             score: score,
             message: message,
-            components: offender_comps
+            components: offendercomponents
         };
     }
 });
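In effect, the rewritten rule deducts points per unique external DNS lookup in the head instead of per offending stylesheet. A worked example with made-up numbers: three external CSS files spread over two unique domains, with config.points set to 4, cost 3 × 4 = 12 points under the old code but only 2 × 4 = 8 under the new code, since only the DNS lookups are penalized.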
sitespeed.io (55 changed lines)
@@ -23,7 +23,7 @@ if (!command -v phantomjs &> /dev/null) ; then
 fi
 
 if [ -z "$1" ]; then
-echo "Missing url. USAGE: ${0} http[s]://host[:port][/path/] [crawl-depth]"
+echo "Missing url. USAGE: ${0} http[s]://host[:port][/path/] [crawl-depth] [follow-path]"
 exit 1;
 fi
 
@@ -37,6 +37,14 @@ else
 DEPTH="1"
 fi
 
+# Check if we should follow a specific path
+if [ "$3" != "" ]
+then
+FOLLOW_PATH="-p $3"
+else
+FOLLOW_PATH=""
+fi
+
 URL="$1"
 
 USER=""
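For reference, a hypothetical invocation of the updated script; the URL, depth and path below are made up, and only the argument order from the usage message and the FOLLOW_PATH handling above come from the diff:

  # Crawl http://www.example.com two levels deep; the third argument is
  # forwarded to the crawler as "-p /blog/" so only that path is followed.
  ./sitespeed.io http://www.example.com 2 /blog/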
@@ -44,7 +52,7 @@ PASSWORD=""
 
 NOW=$(date +"%Y-%m-%d-%H-%M-%S")
 DATE=$(date)
-echo "Will crawl from start point $URL with depth $DEPTH ... this can take a while"
+echo "Will crawl from start point $URL with depth $DEPTH $FOLLOW_PATH ... this can take a while"
 
 
 # remove the protocol
@@ -61,42 +69,9 @@ mkdir $REPORT_DATA_DIR
 mkdir $REPORT_PAGES_DIR
 mkdir $REPORT_DATA_PAGES_DIR
 
-RETRIES=1
-index=0
-isVerified=false
-isHTML=false
-#Faking firefox as useragent
-USERAGENT='Mozilla/5.0 (Windows NT 6.1; rv:15.0) Gecko/20120716 Firefox/15.0a2'
-
-echo "Will start fetching all a links ..."
-
-wget -r -l $DEPTH -nd -t $RETRIES -e robots=off --no-check-certificate --follow-tags=a --spider $URL 2>&1 | while read line
-do
-
-# The spider option checks if a file exist, only fetch only existing
-if [[ $line == "Remote file exists"* ]]
-then
-    isVerified=true
-# And only of content type html
-elif [[ $line = Length* ]] && [[ $line = *html* ]]
-then
-    isHTML=true
-elif [[ $line = Length* ]]
-then
-    isHTML=false
-elif ([[ $line == --* ]] && $isVerified && $isHTML && [[ "$line" == *$HOST* ]])
-then
-    echo "$line" | cut -d " " -f 4
-    echo "$line" | cut -d " " -f 4 >> $REPORT_DATA_DIR/urls.txt
-    isVerified=false
-    isHTML=false
-fi
-done
-
-## Remove duplicates, always needing if we have same resources on multiple pages
-sed '/^$/d' $REPORT_DATA_DIR/urls.txt | sort -u > $REPORT_DATA_DIR/urls-uniq.txt
-mv $REPORT_DATA_DIR/urls-uniq.txt $REPORT_DATA_DIR/urls.txt
+java -Xmx256m -Xms256m -cp dependencies/crawler-0.9-full.jar com.soulgalore.crawler.run.CrawlToFile -u $URL -l $DEPTH $FOLLOW_PATH -f $REPORT_DATA_DIR/urls.txt -ef $REPORT_DATA_DIR/nonworkingurls.txt
+
 # read the urls
 result=()
 while read txt ; do
     result[${#result[@]}]=$txt
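For orientation, the crawler step could be exercised on its own roughly like the sketch below; the start URL, depth, path and output file names are made up, while the jar, main class and flags are those used in the command above (in the script, -u receives $URL, -l $DEPTH, -p the optional follow path, -f the file collecting crawled URLs and -ef the file for non-working URLs):

  # Hypothetical standalone run of the crawler used by sitespeed.io
  java -Xmx256m -Xms256m -cp dependencies/crawler-0.9-full.jar \
    com.soulgalore.crawler.run.CrawlToFile \
    -u http://www.example.com -l 2 -p /blog/ \
    -f urls.txt -ef nonworkingurls.txt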
@@ -104,6 +79,12 @@ done < $REPORT_DATA_DIR/urls.txt
 
 echo "Fetched ${#result[@]} pages"
 
+if [ ${#result[@]} == 0 ]
+then
+exit 0
+fi
+
+
 echo '<?xml version="1.0" encoding="UTF-8"?><document host="'$HOST'" url="'$URL'" date="'$DATE'">' > $REPORT_DATA_DIR/result.xml
 
 pagefilename=1