#!/usr/bin/env bash
#############################################################################
###########################################################################
### Created by A.M.Danischewski (c) 2017+ v1.00
### Issues: If you find any issues, email me at my <first name> dot
### <my last name> at gmail dot com.
###
### This is a simple program intended to grab the oft rickroll'd images from
### those sensationalist click-through image sites.
###
### This program requires the following tools to work to full capacity:
### curl, wget, sed, grep, sort, uniq
###
### This program is free software: you can redistribute it and/or modify
### it under the terms of the GNU General Public License as published by
### the Free Software Foundation, either version 3 of the License, or
### (at your option) any later version.
###
### This program is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
### GNU General Public License for more details.
###
### You should have received a copy of the GNU General Public License
### along with this program. If not, see <http://www.gnu.org/licenses/>.
###########################################################################
#############################################################################
url="${1:-http://allrookie.com/perfectly-timed-sports-moments-to-pump-you-up-for-the-upcoming-olympics/}"
html_prefix="${2:-rickroll}_"
master_file="${html_prefix}master.txt"
start_index=${3:-1}
end_index=${4:-30}
cleanup_flag=${5:-1}
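##
## Usage (a sketch; the script name, URL and prefix are hypothetical):
##   ./grab_images.sh "http://example.com/some-gallery/" gallery 1 10 0
## fetches pages 1-10 into gallery_1.html .. gallery_10.html, collects the
## image links into gallery_master.txt, downloads the images, and (with the
## trailing 0) keeps the intermediate HTML files instead of deleting them.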
## Get all pages
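## The target site is assumed to paginate as <url>/<n>/; -A sends a minimal
## browser-like User-Agent string and -s suppresses curl's progress meter.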
for ((a=start_index; a<=end_index; a++)); do
  echo "Fetching: ${url%/}/${a}/ --> ${html_prefix}${a}.html ..."
  curl -A mozilla -s "${url%/}/${a}/" > "${html_prefix}${a}.html"
done
## Get all image links
for b in "${html_prefix}"*.html; do
  sed 's/http/\nhttp/g' "${b}" \
    | sed -r 's/(^.{250})(.*)/\1/g' \
    | grep '^http' \
    | sed -r 's/(PNG|GIF|JPEG|JPG)(.*)/\1/ig' \
    | grep -Ei '(PNG|GIF|JPEG|JPG)$' >> "${master_file}.tmp"
done
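## The pipeline above splits each page at every "http", trims candidates
## to 250 characters, drops everything after an image extension, and keeps
## only lines that end in one. A roughly equivalent single-pass sketch,
## assuming GNU grep, would be:
##   grep -oEi 'http[^"<> ]*\.(png|gif|jpeg|jpg)' "${b}" >> "${master_file}.tmp"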
## Sort and de-duplicate the image links
sort -u "${master_file}.tmp" > "${master_file}"
rm -v "${master_file}.tmp"
## Fetch image links
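## Key wget flags: -nc skips files already on disk, -nd flattens the remote
## directory tree, --content-disposition honors server-suggested filenames,
## --random-wait varies the delay between requests, and the randomized
## rv:19.x Firefox User-Agent header is a mild bot-detection workaround.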
while read -r c; do
  wget --tries=3 -E -e robots=off -nc --random-wait --content-disposition \
    --no-check-certificate -p --restrict-file-names=windows,lowercase,ascii \
    --header "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:19.$((RANDOM%10))) Gecko/20100101 Firefox/19.0" \
    -nd "${c}"
done < "${master_file}"
(( cleanup_flag )) && rm -v "${html_prefix}"*.html
exit 0