Fixing a page that shows pages that got 404 for the crawl #50
This commit is contained in:
parent
348e4b4183
commit
b3025435df
2
Makefile
2
Makefile
|
|
@ -25,7 +25,7 @@ package:
|
|||
|
||||
@cp sitespeed.io CHANGELOG LICENSE $(BUILD)/
|
||||
@cp $(DEP)/LICENSE.txt $(BUILD)/$(DEP)/
|
||||
@cp $(DEP)/crawler-1.1-full.jar $(BUILD)/$(DEP)/
|
||||
@cp $(DEP)/crawler-1.1.1-full.jar $(BUILD)/$(DEP)/
|
||||
@cp $(DEP)/crawler.properties $(BUILD)/$(DEP)/
|
||||
@cp $(DEP)/xml-velocity-1.3-full.jar $(BUILD)/$(DEP)/
|
||||
@cp $(DEP)/rasterize.js $(BUILD)/$(DEP)/
|
||||
|
|
|
|||
Binary file not shown.
23
sitespeed.io
23
sitespeed.io
|
|
@ -203,7 +203,7 @@ NOPROTOCOL=${URL#*//}
|
|||
HOST=${NOPROTOCOL%%/*}
|
||||
|
||||
# Jar files
|
||||
CRAWLER_JAR=crawler-1.1-full.jar
|
||||
CRAWLER_JAR=crawler-1.1.1-full.jar
|
||||
VELOCITY_JAR=xml-velocity-1.3-full.jar
|
||||
HTMLCOMPRESSOR_JAR=htmlcompressor-1.5.3.jar
|
||||
|
||||
|
|
@ -227,7 +227,7 @@ if $OUTPUT_IMAGES
|
|||
mkdir $REPORT_IMAGE_PAGES_DIR
|
||||
fi
|
||||
|
||||
java -Xmx"$JAVA_HEAP"m -Xms"$JAVA_HEAP"m -Dcom.soulgalore.crawler.propertydir=$DEPENDENCIES_DIR/ $PROXY_CRAWLER -cp $DEPENDENCIES_DIR/$CRAWLER_JAR com.soulgalore.crawler.run.CrawlToFile -u $URL -l $DEPTH $FOLLOW_PATH $NOT_IN_URL -f $REPORT_DATA_DIR/urls.txt -ef $REPORT_DATA_DIR/nonworkingurls.txt
|
||||
java -Xmx"$JAVA_HEAP"m -Xms"$JAVA_HEAP"m -Dcom.soulgalore.crawler.propertydir=$DEPENDENCIES_DIR/ $PROXY_CRAWLER -cp $DEPENDENCIES_DIR/$CRAWLER_JAR com.soulgalore.crawler.run.CrawlToFile -u $URL -l $DEPTH $FOLLOW_PATH $NOT_IN_URL -f $REPORT_DATA_DIR/urls.txt -ef $REPORT_DATA_DIR/404.txt
|
||||
|
||||
if [ ! -e $REPORT_DATA_DIR/urls.txt ];
|
||||
then
|
||||
|
|
@ -261,6 +261,25 @@ done
|
|||
# make sure all processes have finished
|
||||
wait
|
||||
|
||||
# take care of 404s
|
||||
if [ -e $REPORT_DATA_DIR/404.txt ];
|
||||
then
|
||||
result404=()
|
||||
while read txt ; do
|
||||
# Append at result404's own length; indexing by ${#result[@]} (a different array) never advances, so every 404 URL overwrote the same slot.
result404[${#result404[@]}]=$txt
|
||||
done < $REPORT_DATA_DIR/404.txt
|
||||
|
||||
echo '<?xml version="1.0" encoding="UTF-8"?><results>' > $REPORT_DATA_DIR/404.xml
|
||||
for url in "${result404[@]}"
|
||||
do echo "<url>$url</url>" >> $REPORT_DATA_DIR/404.xml
|
||||
done
|
||||
echo '</results>' >> $REPORT_DATA_DIR/404.xml
|
||||
echo 'Create the 404.html'
|
||||
java -Xmx"$JAVA_HEAP"m -Xms"$JAVA_HEAP"m -jar $DEPENDENCIES_DIR/$VELOCITY_JAR $REPORT_DATA_DIR/404.xml $VELOCITY_DIR/404.vm $PROPERTIES_DIR/404.properties $REPORT_DIR/404.html || exit 1
|
||||
java -jar $DEPENDENCIES_DIR/$HTMLCOMPRESSOR_JAR --type html --compress-css --compress-js -o $REPORT_DIR/404.html $REPORT_DIR/404.html
|
||||
|
||||
fi
|
||||
|
||||
echo "Create result.xml"
|
||||
|
||||
echo '<?xml version="1.0" encoding="UTF-8"?><document host="'$HOST'" url="'$URL'" date="'$DATE'">' > $REPORT_DATA_DIR/result.xml
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@
|
|||
</div>
|
||||
<div class="span6">
|
||||
<i class="icon-asterisk"></i>Yes this is a test <i class="icon-fire icon-white"></i>
|
||||
<!--Wow, <a href="nonexisting.html">this</a> is a link to a non existing page.-->
|
||||
<a href="nonexisting.html">this</a> is a link to a non existing page.
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
|||
Loading…
Reference in New Issue