Automating Your Content Pipeline

Lesson 1 of 2

Scraping Hackernoon Articles Programmatically

Estimated time: 10 minutes

Scraping Hackernoon Articles Programmatically

Why Hackernoon?

Hackernoon is a popular tech publication with:

  • Millions of quality tech articles
  • Public user profiles
  • No authentication needed for public content
  • Consistent HTML structure

Understanding Hackernoon's Architecture

Hackernoon is built with Next.js, which embeds each page's data as JSON inside a script tag with the id __NEXT_DATA__. This means you can skip fragile HTML parsing and read structured data directly.

Finding Articles

Profile pages list articles:

https://hackernoon.com/u/{username}

The page HTML contains a <script id="__NEXT_DATA__"> tag with JSON data.

Extraction Method

const fetch = require('node-fetch');
const cheerio = require('cheerio');

/**
 * Fetch a Hackernoon profile page and return its parsed __NEXT_DATA__ payload.
 *
 * @param {string} username - Hackernoon username (the part after /u/).
 * @returns {Promise<object>} Parsed Next.js page data for the profile.
 * @throws {Error} If the HTTP request fails or the page has no __NEXT_DATA__ script.
 */
async function getHackernoonProfile(username) {
  // Encode the username so special characters can't break the URL path.
  const url = `https://hackernoon.com/u/${encodeURIComponent(username)}`;

  const res = await fetch(url, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0'
    }
  });

  // Fail loudly on 404/403/5xx instead of trying to parse an error page.
  if (!res.ok) {
    throw new Error(`Failed to fetch profile ${url}: HTTP ${res.status}`);
  }

  const html = await res.text();
  const $ = cheerio.load(html);

  // Extract __NEXT_DATA__; guard against layout changes so we don't
  // pass null to JSON.parse (which would throw a cryptic error).
  const nextDataScript = $('#__NEXT_DATA__').html();
  if (!nextDataScript) {
    throw new Error(`No __NEXT_DATA__ script found on ${url}`);
  }

  return JSON.parse(nextDataScript);
}

Parsing Article List

/**
 * Flatten the Next.js profile payload into a plain list of article records.
 *
 * @param {object} nextData - Parsed __NEXT_DATA__ from a profile page.
 * @returns {Array<{slug: string, title: string, url: string, created_at: *, reading_time: *, tags: Array}>}
 *   One record per article; empty array when the payload has no articles.
 */
function parseArticles(nextData) {
  // Guard every step of the path: unknown users or layout changes can leave
  // `props`, `pageProps`, or `user` undefined — return [] instead of throwing.
  const articleList = nextData?.props?.pageProps?.user?.articles ?? [];

  return articleList.map((article) => ({
    slug: article.slug,
    title: article.title,
    url: `https://hackernoon.com/${article.slug}`,
    created_at: article.created_at,
    reading_time: article.reading_time_minutes,
    tags: article.tags ?? [],
  }));
}

// Top-level `await` is a SyntaxError in a CommonJS (`require`) script,
// so wrap the demo in an async IIFE and surface any rejection.
(async () => {
  const profileData = await getHackernoonProfile('zbruceli');
  const articles = parseArticles(profileData);
  console.log(articles);
})().catch(console.error);

Getting Full Article Content

/**
 * Fetch a single article page and extract its fields from __NEXT_DATA__.
 *
 * @param {string} articleSlug - Article slug (path segment after hackernoon.com/).
 * @returns {Promise<{title: string, content_html: string, author: string|undefined, published_at: *, tags: Array}>}
 * @throws {Error} If the HTTP request fails or the page has no article payload.
 */
async function getArticleContent(articleSlug) {
  const url = `https://hackernoon.com/${articleSlug}`;

  const res = await fetch(url, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0'
    }
  });

  // Fail loudly on HTTP errors instead of parsing an error page.
  if (!res.ok) {
    throw new Error(`Failed to fetch article ${url}: HTTP ${res.status}`);
  }

  const html = await res.text();
  const $ = cheerio.load(html);

  // Guard against layout changes before JSON.parse sees null.
  const nextDataScript = $('#__NEXT_DATA__').html();
  if (!nextDataScript) {
    throw new Error(`No __NEXT_DATA__ script found on ${url}`);
  }

  const article = JSON.parse(nextDataScript)?.props?.pageProps?.article;
  if (!article) {
    throw new Error(`No article payload in __NEXT_DATA__ for ${url}`);
  }

  return {
    title: article.title,
    content_html: article.articleBody,  // Full HTML content
    author: article.author?.name,       // Optional chain: author may be absent
    published_at: article.published_at,
    tags: article.tags
  };
}

Converting HTML to Markdown

const TurndownService = require('turndown');

/**
 * Convert an HTML fragment to Markdown using Turndown's default rules.
 *
 * @param {string} html - HTML string to convert.
 * @returns {string} Markdown representation of the input.
 */
function htmlToMarkdown(html) {
  return new TurndownService().turndown(html);
}

// Top-level `await` is a SyntaxError in a CommonJS (`require`) script,
// so wrap the demo in an async IIFE and surface any rejection.
(async () => {
  const articleContent = await getArticleContent('ai-power-problem-planet');
  const markdown = htmlToMarkdown(articleContent.content_html);
  console.log(markdown);
})().catch(console.error);

Handling Rate Limits

Hackernoon may rate-limit or block clients that send requests too quickly, so throttle your scraper to one request at a time with a delay between fetches:

const pLimit = require('p-limit');
const limit = pLimit(1);  // 1 request at a time
// Resolve after `ms` milliseconds — used to throttle consecutive requests.
const delay = (ms) => new Promise((done) => setTimeout(done, ms));

/**
 * Fetch the full content of every article on a profile, throttled to one
 * request at a time with a 2-second pause before each fetch.
 *
 * @param {string} username - Hackernoon username.
 * @returns {Promise<Array<object>>} Article content objects, in profile order.
 */
async function scrapeAllArticles(username) {
  const profile = await getHackernoonProfile(username);
  const articles = parseArticles(profile);

  // Queue every article through the concurrency limiter (1 at a time),
  // sleeping before each request to stay polite.
  const tasks = articles.map(({ slug }) =>
    limit(async () => {
      await delay(2000);  // 2 second delay between requests
      return getArticleContent(slug);
    })
  );

  return Promise.all(tasks);
}

Extracting Metadata

/**
 * Pull the display/metadata fields off a raw article record.
 *
 * @param {object} articleData - Raw article record (Hackernoon field names).
 * @returns {object} Normalized metadata with a canonical URL.
 */
function extractMetadata(articleData) {
  // Destructure once, renaming source fields to our normalized names.
  const {
    title,
    author,
    published_at: date,
    tags,
    reading_time_minutes: reading_time,
    slug,
    thumbnail_url: thumbnail,
  } = articleData;

  return {
    title,
    author,
    date,
    tags,
    reading_time,
    canonical_url: `https://hackernoon.com/${slug}`,
    thumbnail,
  };
}

Next: Syncing to Your Blog

Once you have articles with metadata, push them to your personal blog database or static site generator.