Skip to content
Advertisement

Node.js/Axios/Cheerio Web Scraping – issue with Promises

I have an issue with part of my web scraping program. The return res.send(statsArray) line in index.js always returns an empty array on the initial run (using npm start), and will only return a properly filled array after at least one refresh.

Here is the relevant index.html (if needed):

<!DOCTYPE html>
<!-- Host page for the app: the body has no markup of its own and only loads src/app.js. -->
<html>
    <head>
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <title>Web Scraping App</title>
        <meta name="description" content="">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <link rel="stylesheet" href="src/styles.css">
    </head>
    <body>
        <!-- Client script; the tag carries both the async and the defer attribute. -->
        <script src="src/app.js" async defer></script>
    </body>
</html>

Here is my app.js, the file linked by index.html:

// Ask the local server for the scraped player data (names, links, and stats)
// and log the parsed JSON; a failure at any step is logged instead.
fetch('http://localhost:8000/players')
    .then((reply) => reply.json())
    .then((payload) => {
        console.log(payload)
    })
    .catch((failure) => console.log(failure))

Here is the relevant part of my index.js:

// Port the Express server listens on.
const PORT = 8000

const axios = require('axios')
const cheerio = require('cheerio')
const express = require('express')
const cors = require('cors')

// Create the Express app and register the cors() middleware for every route.
const app = express()
app.use(cors())


// Start listening; the callback logs a message once the server is up.
app.listen(PORT , () => console.log(`server running on PORT ${PORT}`))

// Roster page that the /players handler scrapes for player names and links.
const players = 'https://www.trinethunder.com/sports/sball/2021-22/teams/trine?view=roster'
// Site origin; prepended to each scraped href to build a full player URL.
const playerStats = 'https://www.trinethunder.com'
// NOTE(review): these arrays live at module scope, so whatever the /players
// handler pushes into them persists across requests for the life of the process.
const playerName = []
const playerLink = []


// GET /players: scrapes the roster page for player names and links, then
// requests every player's page and sends the collected stats to the client.
app.get('/players', (req, res) => {
    // Loads the roster page and fills the playerName and playerLink arrays
    // declared outside this handler, skipping values that are already present.
    // NOTE(review): the Promise returned here is never settled -- the
    // resolve()/reject() calls below are commented out and the axios chain has
    // no .catch() -- so a callback attached with .then() would never run.
    function getPlayers(){
        return new Promise((resolve, reject) => {
            axios(players)
            .then(response => {
                const html = response.data
                const $ = cheerio.load(html)
                //const error = false;
                $('td.text.pinned-col > a', html).each(function(){
                    var player = $(this).text()
                    // NOTE(review): /ss+/g matches a literal "s" followed by one or
                    // more "s"; possibly /\s\s+/g (runs of whitespace) was intended
                    // and the backslashes were lost -- confirm.
                    player = player.replace(/ss+/g, ' ').trim();
                    //if name not yet in array, push to array
                    if(playerName.indexOf(player) === -1){
                        playerName.push(player)
                    } 
                })
                $('td.text.pinned-col > a', html).each(function(){
                    var link = $(this).attr('href')
                    //if link not yet in array, push to array
                    if(playerLink.indexOf(playerStats+link) === -1){
                        playerLink.push(playerStats+link)
                    } 
                })
                console.log(playerLink)
                
                /*if (!error){
                    resolve()
                } else {
                    reject('Error: something went wrong')
                }*/

            })
        })
    }
    // Waits 3 seconds, then starts one request per entry in playerLink and
    // sends statsArray to the client.
    // NOTE(review): statsArray is not declared in this excerpt; presumably it
    // is defined elsewhere in index.js -- confirm.
    function getPlayerStats(){
        setTimeout(()=>{
            for(let i=0; i<playerLink.length; i++){
                axios.get(playerLink[i])
                .then(response => {
                    const html = response.data
                    const $ = cheerio.load(html)
                    const statName = []
                    const statDesc = []
                    const statNum = []

                    $('h2 > span:nth-child(1)', html).each(function(){
                        var name = $(this).text()
                        statName.push(name)
                    })
                    $('.stat-title', html).each(function(){
                        var stat1 = $(this).text()
                        statDesc.push(stat1)
                    })
                    $('.stat-value', html).each(function(){
                        var stat2 = $(this).text()
                        statNum.push(stat2)
                    })
                    //Conditional is here because sometimes statsArray
                    //gets filled multiple times
                    if(statsArray.length <63){
                    statsArray.push(statName, statDesc, statNum)
                    }
                    
                }).catch(err => console.log(err))
            }
            // NOTE(review): this runs right after the loop has *started* the
            // requests; nothing waits for their .then() callbacks, so the
            // response cannot contain anything those callbacks push later.
            return res.send(statsArray)
        }, 3000)
    }

// NOTE(review): getPlayerStats() is *called* on the line below, so it runs
// immediately and its return value (undefined) is what .then() receives; it is
// not scheduled to run after getPlayers() completes.
getPlayers()
.then(getPlayerStats())
.catch(err => console.log(err))
})

I have been trying to figure out how to loop through each URL, using Promise.all, return new Promise, the async/await keywords, and so on. This approach has gotten me the closest to my desired outcome, but if there’s a better way to do this, please let me know.

I just need to be able to get the result on the first try. There must be something wrong with the way I’m using Promise; all the other data I scrape is returned without refreshing, and they don’t use promises.

Thanks for any help!

Answer

I was unable to pull the data from the “https://www.trinethunder.com” site because I get a 403 error from my IP, but in theory these corrections to the code should help. The only thing I don’t understand is why you build the playerName array if you never use it.

// Port the Express server listens on.
const PORT = 8000;

const axios = require("axios");
const cheerio = require("cheerio");
const express = require("express");
const cors = require("cors");

// Create the Express app and register the cors() middleware for every route.
const app = express();
app.use(cors());

// Start listening; the callback logs a message once the server is up.
app.listen(PORT, () => console.log(`server running on PORT ${PORT}`));

// Roster page that the /players handler scrapes for player links.
const players = "https://www.trinethunder.com/sports/sball/2021-22/teams/trine?view=roster";
// Site origin; prepended to each scraped href to build a full player URL.
const playerStats = "https://www.trinethunder.com";

/**
 * GET /players
 *
 * Scrapes the roster page for player profile links, scrapes every profile for
 * its stats, and responds with a flat JSON array laid out as
 * [statName, statDesc, statNum, statName, statDesc, statNum, ...]: three
 * string arrays per player, in roster order.
 *
 * All state is local to the request and the response is sent only after every
 * profile request has settled, so the very first call already returns complete
 * data (no fixed delay, no size cap to guard against repeated filling). If the
 * roster page itself cannot be fetched, the route answers 500 with a JSON error
 * body instead of leaving the request hanging.
 */
app.get("/players", async (req, res) => {
  /**
   * Fetches the roster page and collects the profile link of every player.
   *
   * @returns {Promise<string[]>} Absolute, de-duplicated profile URLs in page order.
   * @throws Rejects when the roster request fails.
   */
  async function getPlayers() {
    const response = await axios(players);
    const $ = cheerio.load(response.data);
    // A Set keeps first-seen order and drops repeated links.
    const links = new Set();
    $("td.text.pinned-col > a").each(function () {
      const href = $(this).attr("href");
      // An anchor without an href cannot be followed.
      if (href) {
        links.add(new URL(href, playerStats).href);
      }
    });
    return [...links];
  }

  /**
   * Fetches every profile page in parallel and extracts its stats.
   *
   * @param {string[]} playerLink - Profile URLs to scrape.
   * @returns {Promise<string[][]>} Flat list holding the name, description and
   *   value arrays of each player that could be scraped, in input order.
   */
  async function getPlayerStats(playerLink) {
    const perPlayer = await Promise.all(
      playerLink.map(async (link) => {
        try {
          const response = await axios.get(link);
          const $ = cheerio.load(response.data);
          const textsOf = (selector) =>
            $(selector)
              .toArray()
              .map((element) => $(element).text());
          return [
            textsOf("h2 > span:nth-child(1)"),
            textsOf(".stat-title"),
            textsOf(".stat-value"),
          ];
        } catch (err) {
          // Best effort: an unreachable profile is logged and skipped so it
          // neither blocks nor fails the whole response.
          console.log(err);
          return [];
        }
      })
    );
    // One level only: keeps each stat array intact while merging the players.
    return perPlayer.flat();
  }

  try {
    const playerLink = await getPlayers();
    const statsArray = await getPlayerStats(playerLink);
    res.json(statsArray);
  } catch (err) {
    console.log(err);
    res.status(500).json({ error: "Failed to scrape player data" });
  }
});
Advertisement