#45 now only fetch from original domain

This commit is contained in:
Peter Hedenskog 2012-10-12 22:37:18 +02:00
parent 7c39363610
commit ad8181ad22
1 changed file with 3 additions and 3 deletions

View File

@ -70,13 +70,13 @@ USERAGENT='Mozilla/5.0 (Windows NT 6.1; rv:15.0) Gecko/20120716 Firefox/15.0a2'
echo "Will start fetching all a links ..."
wget -r -l $DEPTH -nd -t $RETRIES -e robots=off --no-check-certificate --follow-tags=a --spider -U $USERAGENT $URL 2>&1 | while read line
wget -r -l $DEPTH -nd -t $RETRIES -e robots=off --no-check-certificate --follow-tags=a --spider $URL 2>&1 | while read line
do
# The spider option checks if a file exists; only fetch files that exist
if [[ $line == "Remote file exists"* ]]
then
isVerified=true
isVerified=true
# And only if the content type is html
elif [[ $line = Length* ]] && [[ $line = *html* ]]
then
@ -84,7 +84,7 @@ do
elif [[ $line = Length* ]]
then
isHTML=false
elif ([[ $line == --* ]] && $isVerified && $isHTML)
elif ([[ $line == --* ]] && $isVerified && $isHTML && [[ "$line" == *$HOST* ]])
then
echo "$line" | cut -d " " -f 4
echo "$line" | cut -d " " -f 4 >> $REPORT_DATA_DIR/urls.txt