// Setup notes:
//   npm install sharp
//   npm install puppeteer
//   (Ctrl+Shift+I)
//   curl -sL https://deb.nodesource.com/setup_18.x | sudo -E bash -
//   apt-get update
//   npm install -g npm@9.2.0
// const sharp = require('sharp');
const puppeteer = require('puppeteer'); // Require Puppeteer module
const URL = require('url'); // NOTE: shadows the global URL class with the legacy `url` module
const fs = require('fs');
// NOTE(review): this binding is not referenced by the code in this file (both
// functions declare their own local `hostname`); kept in case other code relies on it.
const { hostname } = require('os');

/**
 * Take a JPEG screenshot of a web page and save it as ./screens/<hostname>.jpg.
 *
 * @param {string} url - Address of the page to capture.
 * @param {string} website - Unused; kept so existing callers keep working.
 * @returns {Promise<number|undefined>} Resolves to 1 when the hostname contains
 *   a blocked word (no screenshot is taken), otherwise resolves to undefined
 *   once the screenshot has been written.
 * @throws {Error} Rejects when the hostname cannot be parsed from `url`, or
 *   when Puppeteer fails (navigation timeout, screenshot write error, ...).
 */
const Screenshot = async (url, website) => {
  const hostname = URL.parse(url).hostname;
  if (!hostname) {
    // Without a hostname there is neither something to filter nor a file name to use.
    throw new Error('cannot determine hostname for URL: ' + url);
  }

  // Skip hosts whose name contains an inappropriate word.
  const list_words = ['bad'];
  for (const word of list_words) {
    if (hostname.includes(word)) {
      console.log('bad word');
      return 1;
    }
  }

  const browser = await puppeteer.launch(); // Launch a "browser"
  try {
    const page = await browser.newPage(); // Open a new page
    await page.setViewport({
      width: 800,
      height: 600,
      deviceScaleFactor: 1,
    });

    // Alternative that was tried previously: waitUntil: 'networkidle2',
    // or { waitUntil: 'load', timeout: 0 } to disable the timeout.
    await page.goto(url, { waitUntil: 'load', timeout: 15000 });

    await page.screenshot({
      type: 'jpeg',
      path: "./screens/" + hostname + ".jpg", // relative to the current working directory
      quality: 65,
      fullPage: false, // viewport only, not the full scrollable page
    });

    await page.close(); // Close the website
  } finally {
    // Always close the browser, even when navigation or the screenshot fails;
    // otherwise every failing URL leaves a browser process behind.
    await browser.close();
  }
};

/**
 * Screenshot every URL listed in raw.txt (one per line), resuming from the
 * line index stored in last_line.txt.
 *
 * Consecutive lines that resolve to the same hostname are captured only once.
 * After each attempted URL the line index is written to last_line.txt, so a
 * restart resumes at (and re-attempts) the last line that was handled.
 * Errors are logged to the console; the returned promise does not reject.
 *
 * @returns {Promise<void>}
 */
const cheeks = async function () {
  try {
    const data = fs.readFileSync('raw.txt', 'utf8');
    const data_lines = data.split('\n');

    // Make sure the checkpoint file exists before it is read.
    if (fs.existsSync('last_line.txt')) {
      console.log("file exists");
    } else {
      console.log("file does not exist");
      // Synchronous on purpose: the file is read immediately below.
      fs.writeFileSync('last_line.txt', '0');
      console.log('Saved!');
    }

    let last_line_number = 0;
    try {
      const stored = Number.parseInt(fs.readFileSync('last_line.txt', 'utf8'), 10);
      if (Number.isNaN(stored) || stored < 0) {
        // A corrupt checkpoint would otherwise make the loop below do nothing.
        console.error('last_line.txt does not contain a valid line number, starting from 0');
      } else {
        last_line_number = stored;
      }
      console.log("last line number is " + last_line_number);
    } catch (err) {
      console.error(err);
    }

    const lines_remaining = data_lines.length - last_line_number;
    // 0.2 is presumably the estimated size in MB of one screenshot — TODO confirm.
    const space_required = lines_remaining * 0.2;
    console.log('lines remaining: ' + lines_remaining);
    console.log('space required: ' + space_required + 'MB');

    // Screenshot() writes into ./screens, so make sure the directory exists.
    fs.mkdirSync('./screens', { recursive: true });

    let previous_domain = '';
    for (let i = last_line_number; i < data_lines.length; i++) {
      const website = data_lines[i].trim(); // also drops a trailing '\r'
      if (website === '') {
        // Blank line, e.g. the empty entry after a trailing newline.
        continue;
      }

      const hostname = URL.parse(website).hostname;
      if (hostname === previous_domain) {
        console.log("same domain");
        continue;
      }
      previous_domain = hostname;

      console.log("going " + i + ' ' + website, ' ', space_required, ' ', lines_remaining);
      try {
        // Wait for each screenshot to finish before moving to the next URL.
        await Screenshot(website, website);

        // Disabled thumbnail step (requires the sharp module):
        //   if (fs.existsSync('./screens/' + hostname + '.jpg')) {
        //     sharp('./screens/' + hostname + '.jpg')
        //       .resize(256, 256)
        //       .toFile('./screens/' + hostname + '-thumb.jpg');
        //     // delete the original image
        //     fs.unlink('./screens/' + hostname + '.jpg', (err) => {
        //       if (err) { console.error(err); }
        //     });
        //   }
      } catch (err) {
        // A failing site must not stop the whole run.
        console.error(err);
      }

      // Record progress synchronously so checkpoints cannot be written out of
      // order and a write failure is handled by the surrounding try/catch.
      fs.writeFileSync('last_line.txt', i.toString());
      console.log('Saved!');
    }
  } catch (err) {
    console.error(err);
  }
};

cheeks();