Merge branch 'crawler'

Peter Hedenskog 2012-10-15 11:13:55 +02:00
commit 2e560f1caf
3 changed files with 28 additions and 44 deletions

BIN  dependencies/crawler-0.9-full.jar (vendored, Normal file)

Binary file not shown.


@@ -5861,7 +5861,7 @@ YSLOW.registerRule({
         var css = doc.getElementsByTagName('link'),
             comps = cset.getComponentsByType('css'),
             comp, docdomain, src, offenders = {},
-            offender_comps = [],
+            offendercomponents = [], uniquedns = [],
             score = 100;
         docdomain = YSLOW.util.getHostname(cset.doc_comp.url);
@@ -5880,20 +5880,23 @@ YSLOW.registerRule({
         for (var i = 0; i < comps.length; i++) {
             if (offenders[comps[i].url]) {
                 if (docdomain !== YSLOW.util.getHostname(comps[i].url)) {
-                    offender_comps.push(comps[i]);
+                    offendercomponents.push(comps[i]);
                 }
             }
         }
-        var message = offender_comps.length === 0 ? '' :
-            'The following ' + YSLOW.util.plural('%num% css', offender_comps.length) +
-            ' are loaded from a different domain inside head, causing DNS lookups before page is rendered.';
-        score -= offender_comps.length * parseInt(config.points, 10)
+        uniquedns = YSLOW.util.getUniqueDomains(offendercomponents, true);
+        var message = offendercomponents.length === 0 ? '' :
+            'The following ' + YSLOW.util.plural('%num% css', offendercomponents.length) +
+            ' are loaded from a different domain inside head, causing DNS lookups before page is rendered. Unique DNS in head that decreases the score:' + uniquedns.length + ".";
+        // only punish dns lookups
+        score -= uniquedns.length * parseInt(config.points, 10)
         return {
             score: score,
             message: message,
-            components: offender_comps
+            components: offendercomponents
         };
     }
 });
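The rule change above stops deducting points once per cross-domain stylesheet and instead deducts once per unique DNS lookup caused by CSS in head. A minimal standalone sketch of that scoring idea, assuming simplified stand-ins for YSLOW.util.getHostname and the rule's config.points (this is not the YSLOW source, only an illustration):

// Sketch: punish one DNS lookup per unique third-party domain,
// not one per stylesheet loaded from it.
function getHostname(url) {
    // simplified stand-in for YSLOW.util.getHostname
    return url.replace(/^https?:\/\//, '').split('/')[0];
}

function scoreCssDnsLookups(docUrl, cssUrlsInHead, pointsPerLookup) {
    var docDomain = getHostname(docUrl),
        seen = {},
        uniqueDns = [],
        i, host;
    for (i = 0; i < cssUrlsInHead.length; i++) {
        host = getHostname(cssUrlsInHead[i]);
        if (host !== docDomain && !seen[host]) {
            seen[host] = true;
            uniqueDns.push(host);
        }
    }
    return 100 - uniqueDns.length * pointsPerLookup;
}

// Two stylesheets from the same CDN now cost a single lookup (score 90
// with 10 points per lookup), where the old rule deducted once per file.
scoreCssDnsLookups('http://www.example.com/',
    ['http://cdn.example.net/a.css', 'http://cdn.example.net/b.css'], 10);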


@@ -23,7 +23,7 @@ if (!command -v phantomjs &> /dev/null) ; then
 fi
 if [ -z "$1" ]; then
-    echo "Missing url. USAGE: ${0} http[s]://host[:port][/path/] [crawl-depth]"
+    echo "Missing url. USAGE: ${0} http[s]://host[:port][/path/] [crawl-depth] [follow-path]"
     exit 1;
 fi
@@ -37,6 +37,14 @@ else
     DEPTH="1"
 fi
+# Check if we should follow a specific path
+if [ "$3" != "" ]
+then
+    FOLLOW_PATH="-p $3"
+else
+    FOLLOW_PATH=""
+fi
 URL="$1"
 USER=""
@@ -44,7 +52,7 @@ PASSWORD=""
 NOW=$(date +"%Y-%m-%d-%H-%M-%S")
 DATE=$(date)
-echo "Will crawl from start point $URL with depth $DEPTH ... this can take a while"
+echo "Will crawl from start point $URL with depth $DEPTH $FOLLOW_PATH ... this can take a while"
 # remove the protocol
@@ -61,42 +69,9 @@ mkdir $REPORT_DATA_DIR
 mkdir $REPORT_PAGES_DIR
 mkdir $REPORT_DATA_PAGES_DIR
-RETRIES=1
-index=0
-isVerified=false
-isHTML=false
-#Faking firefox as useragent
-USERAGENT='Mozilla/5.0 (Windows NT 6.1; rv:15.0) Gecko/20120716 Firefox/15.0a2'
-echo "Will start fetching all a links ..."
-wget -r -l $DEPTH -nd -t $RETRIES -e robots=off --no-check-certificate --follow-tags=a --spider $URL 2>&1 | while read line
-do
-    # The spider option checks if a file exist, only fetch only existing
-    if [[ $line == "Remote file exists"* ]]
-    then
-        isVerified=true
-    # And only of content type html
-    elif [[ $line = Length* ]] && [[ $line = *html* ]]
-    then
-        isHTML=true
-    elif [[ $line = Length* ]]
-    then
-        isHTML=false
-    elif ([[ $line == --* ]] && $isVerified && $isHTML && [[ "$line" == *$HOST* ]])
-    then
-        echo "$line" | cut -d " " -f 4
-        echo "$line" | cut -d " " -f 4 >> $REPORT_DATA_DIR/urls.txt
-        isVerified=false
-        isHTML=false
-    fi
-done
-## Remove duplicates, always needing if we have same resources on multiple pages
-sed '/^$/d' $REPORT_DATA_DIR/urls.txt | sort -u > $REPORT_DATA_DIR/urls-uniq.txt
-mv $REPORT_DATA_DIR/urls-uniq.txt $REPORT_DATA_DIR/urls.txt
+java -Xmx256m -Xms256m -cp dependencies/crawler-0.9-full.jar com.soulgalore.crawler.run.CrawlToFile -u $URL -l $DEPTH $FOLLOW_PATH -f $REPORT_DATA_DIR/urls.txt -ef $REPORT_DATA_DIR/nonworkingurls.txt
 # read the urls
 result=()
 while read txt ; do
     result[${#result[@]}]=$txt
@@ -104,6 +79,12 @@ done < $REPORT_DATA_DIR/urls.txt
 echo "Fetched ${#result[@]} pages"
+if [ ${#result[@]} == 0 ]
+then
+    exit 0
+fi
 echo '<?xml version="1.0" encoding="UTF-8"?><document host="'$HOST'" url="'$URL'" date="'$DATE'">' > $REPORT_DATA_DIR/result.xml
 pagefilename=1
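The script change above replaces the wget spider with the bundled crawler jar and adds an optional third argument that restricts the crawl to a path. A hypothetical invocation under those assumptions; the script name is assumed here, the argument order comes from the usage string, and the crawler flags (-u start URL, -l depth, -p path filter, -f/-ef output files) are the ones used in the diff:

# Crawl two levels deep from the start page, only following links under /docs/
# (script name assumed for illustration)
./sitespeed.io http://www.example.com 2 /docs/

# Roughly the same crawl, run directly against the bundled jar
java -Xmx256m -Xms256m -cp dependencies/crawler-0.9-full.jar \
    com.soulgalore.crawler.run.CrawlToFile \
    -u http://www.example.com -l 2 -p /docs/ \
    -f urls.txt -ef nonworkingurls.txt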