Automating Your Content Pipeline
Lesson 1 of 2
Scraping Hackernoon Articles Programmatically
Estimated time: 10 minutes
Scraping Hackernoon Articles Programmatically
Why Hackernoon?
Hackernoon is a popular tech publication with:
- Millions of quality tech articles
- Public user profiles
- No authentication needed for public content
- Consistent HTML structure
Understanding Hackernoon's Architecture
Hackernoon uses Next.js, which embeds each page's data in a __NEXT_DATA__ JSON object.
Finding Articles
Profile pages list articles:
https://hackernoon.com/u/{username}
The page HTML contains a <script id="__NEXT_DATA__"> tag with JSON data.
Extraction Method
const fetch = require('node-fetch');
const cheerio = require('cheerio');
/**
 * Fetch a Hackernoon profile page and return its embedded __NEXT_DATA__ payload.
 *
 * @param {string} username - Hackernoon handle (the part after /u/).
 * @returns {Promise<object>} Parsed Next.js page data (props.pageProps.…).
 * @throws {Error} If the HTTP request fails or the page has no __NEXT_DATA__ tag.
 */
async function getHackernoonProfile(username) {
  // encodeURIComponent guards against handles containing reserved URL characters.
  const url = `https://hackernoon.com/u/${encodeURIComponent(username)}`;
  const res = await fetch(url, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0'
    }
  });
  // Fail loudly on 404 / 429 / 5xx instead of parsing an error page.
  if (!res.ok) {
    throw new Error(`Profile request failed: ${res.status} ${res.statusText} (${url})`);
  }
  const html = await res.text();
  const $ = cheerio.load(html);
  // Extract __NEXT_DATA__; .html() returns null when the selector matches nothing,
  // and JSON.parse(null) would otherwise throw a confusing SyntaxError.
  const nextDataScript = $('#__NEXT_DATA__').html();
  if (!nextDataScript) {
    throw new Error(`__NEXT_DATA__ script tag not found on ${url}`);
  }
  return JSON.parse(nextDataScript);
}
Parsing Article List
/**
 * Extract a normalized article list from a profile page's __NEXT_DATA__ payload.
 *
 * @param {object} nextData - Parsed __NEXT_DATA__ object from getHackernoonProfile.
 * @returns {Array<{slug: string, title: string, url: string, created_at: *, reading_time: *, tags: Array}>}
 *   One entry per article; [] when the payload has no user/articles section
 *   (e.g. a blocked or not-found page) instead of throwing a TypeError.
 */
function parseArticles(nextData) {
  // Defensive navigation: any missing level yields an empty list.
  const articleList = nextData?.props?.pageProps?.user?.articles ?? [];
  return articleList.map((article) => ({
    slug: article.slug,
    title: article.title,
    url: `https://hackernoon.com/${article.slug}`,
    created_at: article.created_at,
    reading_time: article.reading_time_minutes,
    // ?? (not ||) so an intentional empty array is preserved and only
    // null/undefined fall back.
    tags: article.tags ?? []
  }));
}
// Top-level await is a SyntaxError in CommonJS files (this file uses require),
// so wrap the example in an async IIFE and handle rejection explicitly.
(async () => {
  const profileData = await getHackernoonProfile('zbruceli');
  const articles = parseArticles(profileData);
  console.log(articles);
})().catch((err) => {
  console.error('Profile scrape failed:', err);
  process.exitCode = 1;
});
Getting Full Article Content
/**
 * Fetch a single Hackernoon article page and return its content + metadata.
 *
 * @param {string} articleSlug - URL slug of the article (no leading slash).
 * @returns {Promise<{title: string, content_html: string, author: (string|undefined), published_at: *, tags: *}>}
 * @throws {Error} If the HTTP request fails, the page has no __NEXT_DATA__ tag,
 *   or the payload contains no article object.
 */
async function getArticleContent(articleSlug) {
  const url = `https://hackernoon.com/${articleSlug}`;
  const res = await fetch(url, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0'
    }
  });
  // Surface HTTP-level failures (404, 429 rate limit, …) with a clear message.
  if (!res.ok) {
    throw new Error(`Article request failed: ${res.status} ${res.statusText} (${url})`);
  }
  const html = await res.text();
  const $ = cheerio.load(html);
  // .html() is null when the tag is absent; guard before JSON.parse.
  const nextDataScript = $('#__NEXT_DATA__').html();
  if (!nextDataScript) {
    throw new Error(`__NEXT_DATA__ script tag not found on ${url}`);
  }
  const nextData = JSON.parse(nextDataScript);
  const article = nextData?.props?.pageProps?.article;
  if (!article) {
    throw new Error(`No article payload in __NEXT_DATA__ for slug "${articleSlug}"`);
  }
  return {
    title: article.title,
    content_html: article.articleBody, // Full HTML content
    // ?. guards against a missing author object (would otherwise TypeError).
    author: article.author?.name,
    published_at: article.published_at,
    tags: article.tags
  };
}
Converting HTML to Markdown
const TurndownService = require('turndown');
/**
 * Convert an HTML fragment to Markdown using Turndown's default rules.
 *
 * @param {string} html - HTML markup (e.g. an article body).
 * @returns {string} Markdown rendering of the input.
 */
function htmlToMarkdown(html) {
  return new TurndownService().turndown(html);
}
// Top-level await is not valid in CommonJS; run the example inside an async
// IIFE and report failures instead of crashing with a SyntaxError.
(async () => {
  const articleContent = await getArticleContent('ai-power-problem-planet');
  const markdown = htmlToMarkdown(articleContent.content_html);
  console.log(markdown);
})().catch((err) => {
  console.error('Article scrape failed:', err);
  process.exitCode = 1;
});
Handling Rate Limits
Hackernoon may block aggressive scraping:
const pLimit = require('p-limit');
const limit = pLimit(1); // 1 request at a time — serializes all scraping requests
const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms)); // Promise-based sleep helper
/**
 * Scrape every article for a Hackernoon user, one request at a time.
 *
 * Uses the shared pLimit(1) gate plus a 2-second pause before each article
 * fetch so the site is never hit with concurrent or rapid-fire requests.
 *
 * @param {string} username - Hackernoon handle whose articles to fetch.
 * @returns {Promise<Array>} Full article content objects, in profile order.
 */
async function scrapeAllArticles(username) {
  const profile = await getHackernoonProfile(username);
  const articleList = parseArticles(profile);
  // Promise.all preserves input order; limit(1) forces the work to run serially.
  const tasks = articleList.map((item) =>
    limit(async () => {
      await delay(2000); // polite 2-second gap before each request
      return getArticleContent(item.slug);
    })
  );
  return Promise.all(tasks);
}
Extracting Metadata
/**
 * Build a flat metadata record from a raw article data object.
 *
 * NOTE(review): this reads `slug`, `reading_time_minutes`, and `thumbnail_url`,
 * which the object returned by getArticleContent does not include — those
 * fields come through as undefined unless the caller passes the raw
 * __NEXT_DATA__ article payload instead. Confirm which shape is intended.
 *
 * @param {object} articleData - Article object carrying title/author/date/etc.
 * @returns {object} Normalized metadata (title, author, date, tags,
 *   reading_time, canonical_url, thumbnail).
 */
function extractMetadata({ title, author, published_at, tags, reading_time_minutes, slug, thumbnail_url }) {
  return {
    title,
    author,
    date: published_at,
    tags,
    reading_time: reading_time_minutes,
    canonical_url: `https://hackernoon.com/${slug}`,
    thumbnail: thumbnail_url
  };
}
Next: Syncing to Your Blog
Once you have articles with metadata, push them to your personal blog database or static site generator.