#!/bin/bash
#
# For every domain in an input list, record the page title and download the
# first two images found on the landing page. One get_first_image job is run
# per domain through GNU parallel.
#
# Usage: this_script [domain_list]      (default: yoga_domainname-partial.txt)
#
# The domain list can be produced with, for example:
#   grep 'yoga' domainnames > yoga_domainname.txt
#
# Outputs (relative to the current directory):
#   yoga_domainname_titles.txt          one record per domain; the fields
#                                       domain, title, first image basename and
#                                       second image basename are separated
#                                       by {{{{}}}}
#   yoga_images/<domain>_first_image    raw download of the first image
#   yoga_images/<domain>_second_image   raw download of the second image

#######################################
# Turn the value of an <img src="..."> attribute into a fetchable URL.
# Arguments: $1 - domain the page was fetched from
#            $2 - src attribute value (may be empty)
# Outputs:   resolved URL on stdout; nothing when $2 is empty
# Returns:   0
#######################################
_resolve_image_url() {
  local domain=$1
  local src=$2

  if [[ -z "$src" ]]; then
    return 0
  fi

  if [[ "$src" == //* ]]; then
    # Protocol-relative reference: only the scheme is missing.
    printf 'https:%s\n' "$src"
  elif [[ "$src" == /* ]]; then
    # Root-relative path.
    printf 'https://%s%s\n' "$domain" "$src"
  elif [[ "$src" =~ ^[A-Za-z][A-Za-z0-9+.-]*: ]]; then
    # Already carries a scheme (http:, https:, data:, ...): leave untouched.
    printf '%s\n' "$src"
  else
    # Path without a leading slash. Resolved against the site root, which is
    # an approximation when the landing page redirected into a subdirectory.
    printf 'https://%s/%s\n' "$domain" "$src"
  fi
}

#######################################
# Fetch a domain's landing page, record its title and first two image names,
# and download those images.
# Arguments: $1 - domain name (bare host, e.g. example.com)
# Outputs:   progress / skip messages on stdout; appends one record to
#            yoga_domainname_titles.txt; writes downloads into yoga_images/
# Returns:   0 when the domain is skipped, otherwise the status of the last
#            download command
#######################################
get_first_image() {
  local domain=$1
  # Placeholder list of words that mark a title as adult content.
  local -a blocked_words=("bad" "words")
  local -a images=()
  local page title image word
  local first_image second_image
  local first_image_base second_image_base

  # Fetch the page once and reuse it for both the title and the images.
  # curl: -s = silent, -L = follow redirects. NUL bytes are stripped because
  # a shell variable cannot hold them.
  page=$(curl -sL --max-time 10 "$domain" | tr -d '\0')

  # grep: -o = print only the match, -i = ignore case, -P = Perl regex (needed
  # for \K, which drops everything matched so far from the output).
  # NOTE(review): the leading "<title[^>" of this pattern was missing in the
  # file as received; restored from the variable's purpose - confirm.
  title=$(grep -oiP '<title[^>]*>\K[^<]+' <<<"$page" | head -n 1)

  # Lowercase both values using bash parameter expansion.
  domain=${domain,,}
  title=${title,,}

  if [[ -z "$title" ]]; then
    echo "skipping $domain - no titlle"
    return 0
  fi

  # Only the title is screened; the domain itself is not checked.
  for word in "${blocked_words[@]}"; do
    if [[ "$title" == *"$word"* ]]; then
      echo "skipping $domain - ADULT"
      return 0
    fi
  done

  # Top two image sources, one per line.
  # NOTE(review): the leading "<img[^>" of this pattern was likewise restored.
  image=$(grep -oiP '<img[^>]*?src="\K[^"]+' <<<"$page" | head -n 2)
  if [[ -z "$image" ]]; then
    echo " no images $domain"
    return 0
  fi
  echo "imagees ====== $image"

  # Split on newlines only, so a src containing spaces stays in one piece.
  mapfile -t images <<<"$image"
  first_image=${images[0]}
  second_image=${images[1]:-}
  echo "first image>>>>>>> $first_image"
  echo "second image >>>>>>> $second_image"

  first_image=$(_resolve_image_url "$domain" "$first_image")
  second_image=$(_resolve_image_url "$domain" "$second_image")

  first_image_base=$(basename -- "$first_image")
  second_image_base=""
  if [[ -n "$second_image" ]]; then
    second_image_base=$(basename -- "$second_image")
  fi

  # Append domain, title and image names as one {{{{}}}}-separated record.
  printf '%s{{{{}}}}%s{{{{}}}}%s{{{{}}}}%s\n' \
    "$domain" "$title" "$first_image_base" "$second_image_base" \
    >> yoga_domainname_titles.txt

  # Download the images. --create-dirs makes yoga_images/ on first use.
  # TODO: resizing MUST happen after the download, e.g.
  #   convert "yoga_images/${domain}_first_image" -resize 256x256 \
  #     "yoga_images/${domain}_first_image"
  curl -sL --create-dirs --output "yoga_images/${domain}_first_image" \
    "$first_image"
  if [[ -n "$second_image" ]]; then
    curl -sL --create-dirs --output "yoga_images/${domain}_second_image" \
      "$second_image"
  fi
}

#######################################
# Run get_first_image for every line of the domain list via GNU parallel.
# Arguments: $1 - domain list file (default: yoga_domainname-partial.txt)
# Returns:   1 if the list is unreadable or parallel is missing, otherwise
#            parallel's exit status
#######################################
main() {
  local input=${1:-yoga_domainname-partial.txt}

  if [[ ! -r "$input" ]]; then
    printf 'error: cannot read domain list: %s\n' "$input" >&2
    return 1
  fi
  if ! command -v parallel >/dev/null; then
    printf 'error: GNU parallel is required but was not found\n' >&2
    return 1
  fi

  # parallel starts fresh shells, so every function a job calls must be
  # exported, helpers included.
  export -f get_first_image _resolve_image_url

  # Notes on the parallel invocation:
  #   --bar       show a progress bar
  #   --load 10%  limit how many jobs start based on system load (see
  #               parallel(1) for the exact meaning of the percentage)
  #   ::::        read arguments from the file(s) that follow (instead of -a)
  #   {1}, {2}... positional replacement strings, filled from each input line;
  #               --colsep can split a line into several of them
  # In the shell code run by parallel, write variables as ${var} so they are
  # not confused with parallel's {} placeholders, and remember assignments
  # take no spaces around '=' (image=...).
  parallel --bar --load 10% get_first_image {1} :::: "$input"
}

# Unrelated helper kept for reference: replace the first comma on every line
# of a file with a tab.
#   awk '{sub(/,/, "\t", $0); print $0}' cleanup41M.txt > cleanup41M-tab.txt

if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
  main "$@"
fi