Automating Your Content Pipeline
Lesson 2 of 2
Building a Blog Sync Pipeline
Estimated time: 10 minutes
Building a Blog Sync Pipeline
Architecture
Four stages:
- Fetch: Pull articles from sources (Hackernoon via scraping; Medium and other blogs via their RSS feeds)
- Normalize: Convert to a standard format
- Store: Save to database or static site
- Publish: Generate blog pages and deploy
Stage 1: Multi-Source Fetching
/**
 * Stage 1: pulls raw articles from every configured source.
 * Raw article shape: { title, content, url, published_at, source? }.
 */
class ArticleFetcher {
  /**
   * Fetch articles for a Hackernoon user.
   * @param {string} username - Hackernoon profile name.
   * @returns {Promise<Array<object>>} Raw articles (empty until implemented).
   */
  async fetchFromHackernoon(username) {
    // TODO: plug in the previous lesson's scraping logic.
    // Return an empty array — never undefined — so callers can safely
    // spread the result (the original stub returned undefined, which made
    // `articles.push(...)` throw in fetchAllSources).
    return [];
  }

  /**
   * Fetch and flatten items from a single RSS feed.
   * @param {string} feedUrl - URL of the RSS/Atom feed.
   * @returns {Promise<Array<object>>} Raw articles from the feed.
   */
  async fetchFromRSS(feedUrl) {
    const Parser = require('rss-parser');
    const parser = new Parser();
    const feed = await parser.parseURL(feedUrl);
    return feed.items.map((item) => ({
      title: item.title,
      content: item.content,
      url: item.link,
      published_at: item.pubDate,
      // Tag the origin so the normalizer can record where this came from.
      source: 'rss',
    }));
  }

  /**
   * Fetch from every source enabled in config.
   * @param {{hackernoon?: {username: string}, rss?: string[]}} config
   * @returns {Promise<Array<object>>} Combined raw articles from all sources.
   */
  async fetchAllSources(config) {
    const articles = [];
    if (config.hackernoon) {
      const fromHn = await this.fetchFromHackernoon(config.hackernoon.username);
      // `?? []` guards against a fetcher that returns nothing.
      articles.push(...(fromHn ?? []));
    }
    if (config.rss) {
      for (const feedUrl of config.rss) {
        const fromFeed = await this.fetchFromRSS(feedUrl);
        articles.push(...(fromFeed ?? []));
      }
    }
    return articles;
  }
}
Stage 2: Normalize to Standard Format
/**
 * Stage 2: converts raw articles from any source into the pipeline's
 * standard record shape (slug, markdown content, excerpt, tags, ...).
 */
class ArticleNormalizer {
  /**
   * Normalize one raw article.
   * @param {object} article - Raw article ({ title, content, url, published_at, author?, tags?, source? }).
   * @param {string} [source] - Source label; defaults to article.source or
   *   'unknown' (the pipeline historically omitted this argument, which left
   *   `source` undefined in every stored record).
   * @returns {object} Normalized article record.
   */
  normalize(article, source = article.source ?? 'unknown') {
    const content = article.content ?? '';
    return {
      title: article.title,
      slug: this.slugify(article.title),
      content_markdown: content,
      author: article.author || 'Unknown',
      source,
      source_url: article.url,
      published_at: new Date(article.published_at),
      tags: article.tags || [],
      excerpt: this.generateExcerpt(content),
    };
  }

  /**
   * URL-safe slug: lowercase, non-alphanumerics collapsed to single hyphens,
   * no leading/trailing hyphen. Tolerates a missing title.
   * @param {string} [title]
   * @returns {string}
   */
  slugify(title) {
    return String(title ?? '')
      .toLowerCase()
      .replace(/[^a-z0-9]+/g, '-')
      .replace(/^-|-$/g, '');
  }

  /**
   * First `words` whitespace-separated tokens of the content.
   * Appends an ellipsis only when the content was actually truncated.
   * @param {string} content
   * @param {number} [words=50]
   * @returns {string}
   */
  generateExcerpt(content, words = 50) {
    const tokens = String(content ?? '').split(' ');
    const excerpt = tokens.slice(0, words).join(' ');
    return tokens.length > words ? `${excerpt}...` : excerpt;
  }
}
Stage 3: Deduplication & Storage
const sqlite3 = require('sqlite3');
/**
 * Stage 3: SQLite-backed storage with URL-based deduplication.
 * Relies on the module-level `sqlite3` require.
 */
class ArticleStore {
  /**
   * Open (or create) the database and ensure the schema exists.
   * @param {string} dbPath - Path to the SQLite database file.
   */
  constructor(dbPath) {
    this.db = new sqlite3.Database(dbPath);
    this.init();
  }

  /** Create the articles table on first run; no-op if it already exists. */
  init() {
    this.db.run(`
      CREATE TABLE IF NOT EXISTS articles (
        id INTEGER PRIMARY KEY,
        slug TEXT UNIQUE,
        title TEXT,
        content TEXT,
        source TEXT,
        source_url TEXT UNIQUE,
        published_at DATETIME,
        synced_at DATETIME,
        tags TEXT
      )
    `);
  }

  /**
   * Check whether an article with this source URL was already synced.
   * @param {string} sourceUrl
   * @returns {Promise<boolean>} True if a row with that source_url exists.
   */
  async exists(sourceUrl) {
    return new Promise((resolve, reject) => {
      this.db.get(
        'SELECT id FROM articles WHERE source_url = ?',
        [sourceUrl],
        (err, row) => {
          // Return after reject: the original fell through and also called
          // resolve() on the error path (reject-then-resolve bug).
          if (err) {
            reject(err);
            return;
          }
          resolve(!!row);
        }
      );
    });
  }

  /**
   * Insert a normalized article unless its source URL is already stored.
   * @param {object} article - Normalized article record.
   * @returns {Promise<{status: 'saved'|'duplicate'}>}
   */
  async save(article) {
    const isDuplicate = await this.exists(article.source_url);
    if (isDuplicate) {
      console.log(`Skipping duplicate: ${article.title}`);
      return { status: 'duplicate' };
    }
    return new Promise((resolve, reject) => {
      this.db.run(
        `INSERT INTO articles
         (slug, title, content, source, source_url, published_at, synced_at, tags)
         VALUES (?, ?, ?, ?, ?, ?, ?, ?)`,
        [
          article.slug,
          article.title,
          article.content_markdown,
          article.source,
          article.source_url,
          article.published_at,
          new Date(),
          // Tags are stored as a JSON string; parse on read.
          JSON.stringify(article.tags),
        ],
        (err) => {
          // Same reject-then-resolve fix as exists().
          if (err) {
            reject(err);
            return;
          }
          resolve({ status: 'saved' });
        }
      );
    });
  }
}
Stage 4: Blog Generation
const fs = require('fs').promises;
const path = require('path');
/**
 * Stage 4: renders stored articles to markdown files with YAML front matter
 * and builds an index page. Relies on the module-level `fs`/`path` requires.
 */
class BlogGenerator {
  /**
   * @param {string} outputDir - Root directory for generated files.
   */
  constructor(outputDir) {
    this.outputDir = outputDir;
  }

  /**
   * Write one markdown file per article under `<outputDir>/posts/`.
   * @param {Array<object>} articles - Normalized articles (must have .slug).
   */
  async generatePages(articles) {
    const postsDir = path.join(this.outputDir, 'posts');
    await fs.mkdir(postsDir, { recursive: true });
    for (const article of articles) {
      const content = this.renderArticle(article);
      const filePath = path.join(postsDir, `${article.slug}.md`);
      await fs.writeFile(filePath, content);
      console.log(`Generated: ${filePath}`);
    }
  }

  /**
   * Render an article as markdown with YAML front matter.
   * @param {object} article - Normalized article record.
   * @returns {string}
   */
  renderArticle(article) {
    // Escape double quotes so a title can't terminate the quoted YAML value
    // and corrupt the front matter.
    const safeTitle = String(article.title ?? '').replace(/"/g, '\\"');
    const tags = (article.tags ?? []).join(', ');
    return `---
title: "${safeTitle}"
date: ${article.published_at}
author: ${article.author}
source: ${article.source}
source_url: ${article.source_url}
tags: [${tags}]
---
${article.content_markdown}`;
  }

  /**
   * Write `<outputDir>/index.md` listing all articles, newest first.
   * @param {Array<object>} articles - Normalized articles.
   */
  async generateIndex(articles) {
    // Copy before sorting: Array.prototype.sort mutates its receiver, and
    // the caller's array should not be reordered as a side effect.
    const index = [...articles]
      .sort((a, b) => b.published_at - a.published_at)
      .map((a) => `- [${a.title}](./posts/${a.slug}.md) (${a.source})`)
      .join('\n');
    const content = `# Blog Archive\n\n${index}`;
    await fs.writeFile(path.join(this.outputDir, 'index.md'), content);
  }
}
Full Pipeline
/**
 * End-to-end sync: fetch → normalize → dedupe/store → generate blog pages.
 * Safe to re-run: ArticleStore skips articles whose source_url is already stored.
 */
async function runPipeline() {
  const config = {
    hackernoon: { username: 'zbruceli' },
    rss: ['https://example.com/feed.xml'],
  };

  const fetcher = new ArticleFetcher();
  const normalizer = new ArticleNormalizer();
  const store = new ArticleStore('./blog.db');
  const generator = new BlogGenerator('./output');

  // Fetch
  console.log('Fetching articles...');
  const rawArticles = await fetcher.fetchAllSources(config);

  // Normalize & Store — one bad article must not abort the whole run.
  console.log(`Processing ${rawArticles.length} articles...`);
  const stored = [];
  for (const article of rawArticles) {
    try {
      // Pass the source label explicitly: normalize() takes it as its second
      // argument, which the original pipeline omitted (leaving source
      // undefined in every stored record).
      const normalized = normalizer.normalize(article, article.source ?? 'unknown');
      const result = await store.save(normalized);
      if (result.status === 'saved') {
        stored.push(normalized);
      }
    } catch (err) {
      console.error(`Failed to process "${article.title}":`, err);
    }
  }

  // Generate
  console.log(`Generating blog pages for ${stored.length} articles...`);
  await generator.generatePages(stored);
  await generator.generateIndex(stored);
  console.log('Done!');
}
// Run on a schedule (e.g., daily at 6 AM)
// Schedule the sync daily at 06:00 (cron '0 6 * * *' = minute 0, hour 6).
// NOTE(review): node-cron presumably evaluates this against the server's
// local timezone — confirm, and consider the `timezone` option if UTC is wanted.
const cron = require('node-cron');
// NOTE(review): a rejection from runPipeline is unhandled here — consider
// wrapping it so scheduled-run failures are logged instead of lost.
cron.schedule('0 6 * * *', runPipeline);
Next: Deployment
Automatically push updated blog to GitHub Pages, Vercel, or Netlify after generation.