Fixing a page that shows pages that got 404 for the crawl #50

This commit is contained in:
Peter Hedenskog 2012-12-16 21:04:09 +01:00
parent 348e4b4183
commit b3025435df
4 changed files with 23 additions and 4 deletions

View File

@ -25,7 +25,7 @@ package:
@cp sitespeed.io CHANGELOG LICENSE $(BUILD)/
@cp $(DEP)/LICENSE.txt $(BUILD)/$(DEP)/
@cp $(DEP)/crawler-1.1-full.jar $(BUILD)/$(DEP)/
@cp $(DEP)/crawler-1.1.1-full.jar $(BUILD)/$(DEP)/
@cp $(DEP)/crawler.properties $(BUILD)/$(DEP)/
@cp $(DEP)/xml-velocity-1.3-full.jar $(BUILD)/$(DEP)/
@cp $(DEP)/rasterize.js $(BUILD)/$(DEP)/

View File

@ -203,7 +203,7 @@ NOPROTOCOL=${URL#*//}
HOST=${NOPROTOCOL%%/*}
# Jar files
CRAWLER_JAR=crawler-1.1-full.jar
CRAWLER_JAR=crawler-1.1.1-full.jar
VELOCITY_JAR=xml-velocity-1.3-full.jar
HTMLCOMPRESSOR_JAR=htmlcompressor-1.5.3.jar
@ -227,7 +227,7 @@ if $OUTPUT_IMAGES
mkdir $REPORT_IMAGE_PAGES_DIR
fi
java -Xmx"$JAVA_HEAP"m -Xms"$JAVA_HEAP"m -Dcom.soulgalore.crawler.propertydir=$DEPENDENCIES_DIR/ $PROXY_CRAWLER -cp $DEPENDENCIES_DIR/$CRAWLER_JAR com.soulgalore.crawler.run.CrawlToFile -u $URL -l $DEPTH $FOLLOW_PATH $NOT_IN_URL -f $REPORT_DATA_DIR/urls.txt -ef $REPORT_DATA_DIR/nonworkingurls.txt
java -Xmx"$JAVA_HEAP"m -Xms"$JAVA_HEAP"m -Dcom.soulgalore.crawler.propertydir=$DEPENDENCIES_DIR/ $PROXY_CRAWLER -cp $DEPENDENCIES_DIR/$CRAWLER_JAR com.soulgalore.crawler.run.CrawlToFile -u $URL -l $DEPTH $FOLLOW_PATH $NOT_IN_URL -f $REPORT_DATA_DIR/urls.txt -ef $REPORT_DATA_DIR/404.txt
if [ ! -e $REPORT_DATA_DIR/urls.txt ];
then
@ -261,6 +261,25 @@ done
# make sure all processes have finished
wait
# take care of 404:s
# Turn 404.txt (the file handed to the crawler's -ef option, read here as one
# URL per line) into 404.xml and render that to 404.html. Nothing is done when
# no 404.txt exists.
if [ -e "$REPORT_DATA_DIR/404.txt" ];
then
  result404=()
  # -r keeps backslashes literal; the || clause keeps a final line that has no
  # trailing newline
  while read -r txt || [ -n "$txt" ]; do
    # append at result404's own length so that every URL gets its own slot
    result404[${#result404[@]}]=$txt
  done < "$REPORT_DATA_DIR/404.txt"
  echo '<?xml version="1.0" encoding="UTF-8"?><results>' > "$REPORT_DATA_DIR/404.xml"
  for url in "${result404[@]}"
  do
    # escape XML metacharacters (& first, so the entities themselves are not
    # re-escaped); a raw & from a query string would make 404.xml malformed
    escaped=$(printf '%s\n' "$url" | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g')
    echo "<url>$escaped</url>" >> "$REPORT_DATA_DIR/404.xml"
  done
  echo '</results>' >> "$REPORT_DATA_DIR/404.xml"
  echo 'Create the 404.html'
  # abort the run if the template step fails; the compressor then rewrites
  # 404.html in place
  java -Xmx"$JAVA_HEAP"m -Xms"$JAVA_HEAP"m -jar "$DEPENDENCIES_DIR/$VELOCITY_JAR" "$REPORT_DATA_DIR/404.xml" "$VELOCITY_DIR/404.vm" "$PROPERTIES_DIR/404.properties" "$REPORT_DIR/404.html" || exit 1
  java -jar "$DEPENDENCIES_DIR/$HTMLCOMPRESSOR_JAR" --type html --compress-css --compress-js -o "$REPORT_DIR/404.html" "$REPORT_DIR/404.html"
fi
echo "Create result.xml"
echo '<?xml version="1.0" encoding="UTF-8"?><document host="'$HOST'" url="'$URL'" date="'$DATE'">' > $REPORT_DATA_DIR/result.xml

View File

@ -23,7 +23,7 @@
</div>
<div class="span6">
<i class="icon-asterisk"></i>Yes this is a test <i class="icon-fire icon-white"></i>
<!--Wow, <a href="nonexisting.html">this</a> is a link to a non existing page.-->
<a href="nonexisting.html">this</a> is a link to a non existing page.
</div>
</div>
</div>