Let's write a function that iteratively crawls the links of a website using Puppeteer:
/**
 * Crawls a website starting from `startURL`, following only links that share
 * the same root URL, and returns the set of pages visited.
 *
 * @param {string} startURL - Absolute URL to start crawling from; also used
 *   as the prefix filter for which links are followed.
 * @returns {Promise<Object<string, boolean>>} Map of visited URL -> true.
 */
const exploreLinks = async (startURL) => {
  // Launching Puppeteer
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    // Opening a new page
    const page = await browser.newPage();

    // Going to start URL and waiting until loaded
    await page.goto(startURL, { waitUntil: 'networkidle0' });

    // Extracts all same-root link hrefs from the current page.
    // NOTE: the $$eval callback is serialized and executed in the BROWSER
    // context, so it cannot close over `startURL` — it must be passed
    // explicitly as an extra argument (the original closure version threw
    // a ReferenceError inside the page).
    const collectLinks = () =>
      page.$$eval(
        'a',
        (links, base) =>
          links.map((link) => link.href).filter((href) => href.startsWith(base)),
        startURL
      );

    // Getting the links of the startURL, filtered to the same root
    let urlsToVisit = await collectLinks();

    // Marking startURL as visited
    const urlsVisited = { [startURL]: true };

    // Breadth-first walk: newly discovered links are appended to the queue,
    // so the loop bound grows as the crawl proceeds.
    for (let i = 0; i < urlsToVisit.length; i++) {
      const url = urlsToVisit[i];
      if (urlsVisited[url]) continue;

      // Going to next URL and waiting until loaded
      await page.goto(url, { waitUntil: 'networkidle0' });

      // Getting urls again
      const newUrls = await collectLinks();

      // Adding to the pending queue while removing duplicates
      urlsToVisit = [...new Set([...urlsToVisit, ...newUrls])];

      // Marking URL as visited
      urlsVisited[url] = true;
    }

    return urlsVisited;
  } finally {
    // Always release the browser process, even if a navigation fails —
    // the original version leaked Chromium on any error and never closed it.
    await browser.close();
  }
};
Now you can call the function:
// Kick off the crawl. The promise must not be left floating: without a
// rejection handler, any navigation/launch error becomes an unhandled
// promise rejection (a hard crash in modern Node).
exploreLinks('https://erikmartinjordan.com').catch((err) => {
  console.error('Crawl failed:', err);
});
Hi, I'm Erik, an engineer from Barcelona. If you like the post or have any comments, say hi.