#!/usr/bin/env bash
# based on https://github.com/kajeagentspi/Datahoarder
set -e

URL=$1
ROOT_PATH=$2
LIST=./list.txt
MAX_CONNECTIONS_PER_SERVER=16
USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"

usage() {
    cat <<EOF
Usage: $0 URL ROOT_PATH

Spiders URL with wget and writes every file url it finds to $LIST.
EOF
}

spider() {
    local logfile=./wget.log

    # Crawl the site without downloading anything; wget logs every url it
    # visits. Spider mode exits nonzero when it hits broken links, which
    # would kill the script under "set -e", hence the "|| true".
    wget --spider --recursive --no-parent -e robots=off \
        --user-agent="$USER_AGENT" -o "$logfile" "$URL" || true

    # NOTE: the step that built $logfile.tmp was lost from the original;
    # assume it collected the urls wget flagged as broken links
    # ("... -- broken link!!!") so they can be removed from the log.
    grep -B5 'broken link!!!' "$logfile" | grep -oE 'https?://[^ ]+' > "$logfile".tmp || true
    while read -r line; do
        sed -i "\|$line|d" "$logfile"
    done < "$logfile".tmp

    # Keep the request lines ("--YYYY-MM-DD HH:MM:SS--  url"), drop urls
    # ending in "/" (directories), and strip the timestamp prefix so only
    # bare urls remain.
    grep '^--[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9]--' "$logfile" \
        | grep '[^/]$' \
        | sed -e 's/^--[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9]-- *//' > "$LIST"

    # Delete the folder made by wget (this deletes ALL empty directories in
    # the directory this script is run from).
    find . -type d -empty -delete
    # If you have a fix for this contact me since it should only delete the
    # folder created by wget.
}

if [[ -z $1 || -z $2 || $# -ge 3 ]]; then
    usage
    exit 1
fi

echo "Creating list of urls..."
spider
echo "List created!"
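
# ROOT_PATH and MAX_CONNECTIONS_PER_SERVER are defined above but never used,
# which suggests the list was meant to be fed to a downloader afterwards.
# aria2c has a matching --max-connection-per-server option, so a plausible
# (untested) follow-up step would be:
#
#   aria2c --input-file="$LIST" --dir="$ROOT_PATH" \
#       --max-connection-per-server="$MAX_CONNECTIONS_PER_SERVER" \
#       --user-agent="$USER_AGENT"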