Wednesday, February 8, 2017

Grab Rickroll Images - Quick w/out the Clicks

#!/usr/bin/env bash 
#############################################################################
###########################################################################
### Created by A.M.Danischewski (c) 2017+ v1.00
### Issues: If you find any issues email me at my <first name> dot 
###         <my last name> at gmail dot com.  
###
### This is a simple program intended to grab the oft rickroll'd images from 
### those sensationalist click-through image sites. 
### 
### To work at full capacity, this program requires by default: 
### curl, wget, sed, grep, uniq, sort
### 
### This program is free software: you can redistribute it and/or modify
### it under the terms of the GNU General Public License as published by
### the Free Software Foundation, either version 3 of the License, or
### (at your option) any later version.
###
### This program is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
### GNU General Public License for more details.
###
### You should have received a copy of the GNU General Public License
### along with this program.  If not, see <http://www.gnu.org/licenses/>.
###########################################################################
#############################################################################
## Target gallery URL; first CLI arg overrides the default demo site.
url="${1:-http://allrookie.com/perfectly-timed-sports-moments-to-pump-you-up-for-the-upcoming-olympics/}"
## Filename prefix for the downloaded HTML pages; the trailing "_"
## separates the prefix from the page index.
html_prefix="${2:-rickroll}_"
## Master list of extracted image URLs. html_prefix already ends in "_",
## so no extra underscore here (the original produced "rickroll__master.txt").
master_file="${html_prefix}master.txt"
## First and last page index to crawl (inclusive).
start_index="${3:-1}"
end_index="${4:-30}"
## Non-zero => delete the fetched HTML pages when done.
cleanup_flag="${5:-1}"

 ## Get all pages
for ((a=start_index; a<=end_index; a++)); do
  echo "Fetching: ${url%/}/${a}/  -->  ${html_prefix}${a}.html ..."
  ## Quote the redirect target (SC2086): an unquoted ${html_prefix} would
  ## word-split or glob-expand if it contained spaces or wildcards.
  curl -A mozilla -s "${url%/}/${a}/" > "${html_prefix}${a}.html"
done

 ## Get all image links
## Iterate with a shell glob instead of parsing `ls` output (SC2045):
## filenames containing spaces or glob characters would break the old loop.
for page in "${html_prefix}"*html; do
  [[ -e "${page}" ]] || continue  # glob matched nothing — skip the literal pattern
  ## Pipeline: 1) put every "http" at the start of a line, 2) truncate each
  ## line to 250 chars, 3) keep lines starting with http, 4) drop everything
  ## after the image extension, 5) keep only lines ending in an image extension.
  sed 's/http/\nhttp/g' "${page}" \
    | sed -r 's/(^.{250})(.*)/\1/g' \
    | grep "^http" \
    | sed -r 's/(PNG|GIF|JPEG|JPG)(.*)/\1/ig' \
    | grep -Ei "(PNG|GIF|JPEG|JPG)$" >> "${master_file}.tmp"
done

 ## Uniq and sort the image links
## `sort -u` replaces the `sort | uniq` pipeline: identical output, one
## fewer process (uniq after sort is redundant).
sort -u "${master_file}.tmp" > "${master_file}"
rm -v "${master_file}.tmp"

 ## Fetch image links
## IFS= and -r keep leading/trailing whitespace and literal backslashes in
## each URL intact (a bare `read` interprets backslashes and trims blanks).
while IFS= read -r link; do
  ## Randomized Firefox point release in the User-Agent to look less bot-like.
  wget --tries=3 -E -e robots=off -nc --random-wait --content-disposition --no-check-certificate -p --restrict-file-names=windows,lowercase,ascii --header "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:19.$(($RANDOM%10))) Gecko/20100101 Firefox/19.0" -nd "${link}"
done < "${master_file}"

## Remove the fetched HTML pages unless the caller disabled cleanup ($5 = 0).
if (( cleanup_flag )); then
  rm -v "${html_prefix}"*html
fi

exit 0

No comments:

Post a Comment