#!/bin/bash
#grep 'yoga' domainnames > yoga_domainname.txt
#######################################
# Scrape one site: record its page title and the file names of its first two
# <img> sources, then download those images.
#
# The function is kept self-contained (no helper functions) because only
# get_first_image itself is exported with `export -f` for the job runner.
#
# Arguments:
#   $1 - domain name (bare host such as example.org); lower-cased for all
#        messages, records and output file names.
# Outputs:
#   - Progress / skip messages on stdout.
#   - Appends one record to yoga_domainname_titles.txt, fields separated by
#     the literal marker {{{{}}}}: domain, title, first image basename,
#     second image basename (empty when the page has a single image).
#   - Writes yoga_images/<domain>_first_image and, when a second image
#     exists, yoga_images/<domain>_second_image.
# Returns:
#   0 always; pages without a title, flagged pages and pages without images
#   are skipped with a message rather than treated as errors.
#######################################
get_first_image() {
  local domain=$1
  local page title image word src i
  local first_image second_image first_image_base second_image_base
  local -a images=()
  # Words that mark a title as adult content. Only the title is screened.
  local -a adult_words=("bad" "words")

  # Fetch the page once and reuse it for both the title and the image scan.
  # curl -s = silent, -L = follow redirects.
  page=$(curl -sL --max-time 10 "$domain")

  # grep -o = print only the match, -P = Perl regex (needed for \K, which
  # drops the opening <title ...> tag from the match; do not combine with -E),
  # -a = treat the input as text so odd bytes do not turn the output into a
  # "Binary file matches" notice.
  title=$(grep -aoP '<title[^>]*>\K[^<]+' <<<"$page" | head -n 1)

  # Normalise to lowercase using bash parameter expansion.
  domain=${domain,,}
  title=${title,,}

  if [[ -z "$title" ]]; then
    echo "skipping $domain - no titlle"
    return 0
  fi

  for word in "${adult_words[@]}"; do
    if [[ "$title" == *"$word"* ]]; then
      echo "skipping $domain - ADULT"
      return 0
    fi
  done

  # Take the src attribute of the first two <img> tags, one per line.
  # Manual check of the pattern:
  #   curl -s "https://100women.org/" | grep -oP '<img[^>]*?src="\K[^"]+' | head -1
  image=$(grep -aoP '<img[^>]*?src="\K[^"]+' <<<"$page" | head -n 2)
  if [[ -z "$image" ]]; then
    echo " no images $domain"
    return 0
  fi
  echo "imagees ====== $image"

  # Split on newlines only, so a URL containing spaces stays in one piece.
  mapfile -t images <<<"$image"
  first_image=${images[0]}
  second_image=${images[1]-}
  echo "first image>>>>>>> $first_image"
  echo "second image >>>>>>> $second_image"

  # Turn relative references into absolute URLs.
  for i in "${!images[@]}"; do
    src=${images[i]}
    if [[ "$src" == //* ]]; then
      # Protocol-relative: only the scheme is missing.
      src="https:$src"
    elif [[ "$src" == /* ]]; then
      # Root-relative path.
      src="https://$domain$src"
    elif [[ ! "$src" =~ ^[A-Za-z][A-Za-z0-9+.-]*: ]]; then
      # No scheme: document-relative path. Resolved against the site root,
      # which assumes the fetched page lives at the root -- TODO confirm.
      src="https://$domain/$src"
    fi
    images[i]=$src
  done
  first_image=${images[0]}
  second_image=${images[1]-}

  first_image_base=$(basename -- "$first_image")
  second_image_base=""
  if [[ -n "$second_image" ]]; then
    second_image_base=$(basename -- "$second_image")
  fi

  # Append the domain name, title and image names; fields separated by {{{{}}}}.
  printf '%s{{{{}}}}%s{{{{}}}}%s{{{{}}}}%s\n' \
    "$domain" "$title" "$first_image_base" "$second_image_base" \
    >> yoga_domainname_titles.txt

  # Download the images. Resizing is a separate, later step, e.g.:
  #   convert "yoga_images/${domain}_first_image" -resize 256x256 "yoga_images/${domain}_first_image"
  mkdir -p yoga_images
  curl -sL --output "yoga_images/${domain}_first_image" "$first_image"
  if [[ -n "$second_image" ]]; then
    curl -sL --output "yoga_images/${domain}_second_image" "$second_image"
  fi

  # Best-effort: a failed download must not mark the whole job as failed.
  return 0
}
# Export the function so the shells spawned by GNU parallel can call it.
export -f get_first_image
# One job per line of the argument file; --bar shows a progress bar and
# --load 10% holds back new jobs while the system load is above that limit.
parallel --bar --load 10% \
  --arg-file yoga_domainname-partial.txt \
  get_first_image '{1}'
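# --load <limit>: only start new jobs while the system load is below <limit>
#   (e.g. 70%; the command above uses 10%).
# --colsep: the columns of each input line can be bound to the positional
#   replacement strings {1}, {2}, ...
# The command to run comes first (e.g. get_favicon), followed by its arguments.
# Instead of -a, input files can be given after ::::
# {1}, {2}, etc. are the arguments taken from each line of the file.
# No spaces around = in a bash assignment (image=..., not image = ...).
# Enclose bash variables in braces (${var}) so they are not confused with parallel's {} replacement strings.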
#awk '{sub(/,/, "\t", $0); print $0}' cleanup41M.txt > cleanup41M-tab.txt
# i.e. replace the first comma on each line with a tab (sub changes only the first match; use gsub to replace every comma)