Automating Your Content Pipeline

Lesson 2 of 2

Building a Blog Sync Pipeline

Estimated time: 10 minutes

Building a Blog Sync Pipeline

Architecture

Four stages:

  1. Fetch: Pull articles from sources (Hackernoon, Medium, RSS)
  2. Normalize: Convert to a standard format
  3. Store: Save to database or static site
  4. Publish: Generate blog pages and deploy

Stage 1: Multi-Source Fetching

/**
 * Stage 1: pulls raw articles from each configured source.
 */
class ArticleFetcher {
  /**
   * Pull articles for a Hackernoon profile.
   * @param {string} username - Hackernoon profile name.
   * @returns {Promise<Array<object>|undefined>} Raw articles (stubbed here —
   *   plug in the previous lesson's scraping logic).
   */
  async fetchFromHackernoon(username) {
    // Use previous lesson's scraping logic
  }

  /**
   * Fetch and parse one RSS feed into raw article records.
   * @param {string} feedUrl - Absolute URL of the feed.
   * @returns {Promise<Array<{title, content, url, published_at}>>}
   */
  async fetchFromRSS(feedUrl) {
    const Parser = require('rss-parser');
    const parser = new Parser();
    const feed = await parser.parseURL(feedUrl);

    return feed.items.map((item) => ({
      title: item.title,
      content: item.content,
      url: item.link,
      published_at: item.pubDate
    }));
  }

  /**
   * Fetch from every configured source and flatten into a single list.
   * RSS feeds are fetched concurrently; a source that yields no array
   * (e.g. the stubbed Hackernoon fetcher) is skipped instead of crashing.
   * @param {{hackernoon?: {username: string}, rss?: string[]}} config
   * @returns {Promise<Array<object>>} All fetched raw articles.
   */
  async fetchAllSources(config) {
    const articles = [];

    if (config.hackernoon) {
      // BUGFIX: the fetcher may return undefined (it is a stub) — spreading
      // undefined would throw a TypeError.
      const hn = await this.fetchFromHackernoon(config.hackernoon.username);
      if (Array.isArray(hn)) {
        articles.push(...hn);
      }
    }

    if (config.rss) {
      // BUGFIX: fetch independent feeds in parallel instead of one-by-one.
      const feeds = await Promise.all(
        config.rss.map((feedUrl) => this.fetchFromRSS(feedUrl))
      );
      for (const feedArticles of feeds) {
        if (Array.isArray(feedArticles)) {
          articles.push(...feedArticles);
        }
      }
    }

    return articles;
  }
}

Stage 2: Normalize to Standard Format

/**
 * Stage 2: converts raw source-specific articles into one standard shape.
 */
class ArticleNormalizer {
  /**
   * Normalize a raw article into the standard record format.
   * @param {object} article - Raw article from any fetcher.
   * @param {string} source - Source identifier (e.g. 'hackernoon', 'rss').
   * @returns {object} Normalized article record.
   */
  normalize(article, source) {
    // BUGFIX: RSS items may lack a content field; avoid crashing on
    // undefined in generateExcerpt/split.
    const content = article.content ?? '';
    return {
      title: article.title,
      slug: this.slugify(article.title),
      content_markdown: content,
      author: article.author || 'Unknown',
      source: source,
      source_url: article.url,
      published_at: new Date(article.published_at),
      tags: article.tags || [],
      excerpt: this.generateExcerpt(content)
    };
  }

  /**
   * Build a URL-safe slug: lowercase, non-alphanumerics collapsed to '-',
   * no leading/trailing dash.
   * @param {string} title
   * @returns {string}
   */
  slugify(title) {
    return title
      .toLowerCase()
      .replace(/[^a-z0-9]+/g, '-')
      .replace(/^-|-$/g, '');
  }

  /**
   * Take the first `words` whitespace-separated words of the content.
   * @param {string} content - Article body.
   * @param {number} [words=50] - Maximum word count.
   * @returns {string} Excerpt, with '...' only when actually truncated.
   */
  generateExcerpt(content, words = 50) {
    const tokens = content.split(' ');
    const excerpt = tokens.slice(0, words).join(' ');
    // BUGFIX: only append an ellipsis when content was truncated.
    return tokens.length > words ? excerpt + '...' : excerpt;
  }
}

Stage 3: Deduplication & Storage

const sqlite3 = require('sqlite3');

/**
 * Stage 3: SQLite-backed storage with URL-based deduplication.
 */
class ArticleStore {
  /**
   * Open (or create) the database and ensure the schema exists.
   * @param {string} dbPath - Path to the SQLite file.
   */
  constructor(dbPath) {
    this.db = new sqlite3.Database(dbPath);
    this.init();
  }

  /** Create the articles table on first run (no-op afterwards). */
  init() {
    this.db.run(`
      CREATE TABLE IF NOT EXISTS articles (
        id INTEGER PRIMARY KEY,
        slug TEXT UNIQUE,
        title TEXT,
        content TEXT,
        source TEXT,
        source_url TEXT UNIQUE,
        published_at DATETIME,
        synced_at DATETIME,
        tags TEXT
      )
    `);
  }

  /**
   * Check whether an article with this source URL is already stored.
   * @param {string} sourceUrl
   * @returns {Promise<boolean>}
   */
  async exists(sourceUrl) {
    return new Promise((resolve, reject) => {
      this.db.get(
        'SELECT id FROM articles WHERE source_url = ?',
        [sourceUrl],
        (err, row) => {
          // BUGFIX: return after reject so resolve() is not also called.
          if (err) return reject(err);
          resolve(!!row);
        }
      );
    });
  }

  /**
   * Insert a normalized article unless its source URL is already present.
   * @param {object} article - Normalized article record.
   * @returns {Promise<{status: 'saved'|'duplicate'}>}
   */
  async save(article) {
    const isDuplicate = await this.exists(article.source_url);

    if (isDuplicate) {
      console.log(`Skipping duplicate: ${article.title}`);
      return { status: 'duplicate' };
    }

    return new Promise((resolve, reject) => {
      this.db.run(
        `INSERT INTO articles 
         (slug, title, content, source, source_url, published_at, synced_at, tags)
         VALUES (?, ?, ?, ?, ?, ?, ?, ?)`,
        [
          article.slug,
          article.title,
          article.content_markdown,
          article.source,
          article.source_url,
          article.published_at,
          new Date(),
          // BUGFIX: stringify undefined tags as '[]', not SQL NULL.
          JSON.stringify(article.tags ?? [])
        ],
        (err) => {
          // BUGFIX: return after reject so resolve() is not also called.
          if (err) return reject(err);
          resolve({ status: 'saved' });
        }
      );
    });
  }
}

Stage 4: Blog Generation

const fs = require('fs').promises;
const path = require('path');

/**
 * Stage 4: renders stored articles into Markdown blog pages plus an index.
 */
class BlogGenerator {
  /**
   * @param {string} outputDir - Root directory for generated files.
   */
  constructor(outputDir) {
    this.outputDir = outputDir;
  }

  /**
   * Write one Markdown file per article into `<outputDir>/posts/`.
   * @param {Array<object>} articles - Normalized article records.
   */
  async generatePages(articles) {
    const postsDir = path.join(this.outputDir, 'posts');
    await fs.mkdir(postsDir, { recursive: true });

    for (const article of articles) {
      const content = this.renderArticle(article);
      const filePath = path.join(postsDir, `${article.slug}.md`);
      await fs.writeFile(filePath, content);
      console.log(`Generated: ${filePath}`);
    }
  }

  /**
   * Render one article as Markdown with YAML front matter.
   * @param {object} article - Normalized article record.
   * @returns {string} Front matter followed by the Markdown body.
   */
  renderArticle(article) {
    // BUGFIX: escape embedded double quotes so the quoted YAML title
    // stays valid.
    const safeTitle = String(article.title).replace(/"/g, '\\"');
    // BUGFIX: emit ISO 8601 — Date#toString output is not reliably parsed
    // by static-site generators.
    const date = article.published_at instanceof Date
      ? article.published_at.toISOString()
      : article.published_at;

    return `---
title: "${safeTitle}"
date: ${date}
author: ${article.author}
source: ${article.source}
source_url: ${article.source_url}
tags: [${article.tags.join(', ')}]
---

${article.content_markdown}`;
  }

  /**
   * Write `<outputDir>/index.md` listing all articles, newest first.
   * @param {Array<object>} articles - Normalized article records.
   */
  async generateIndex(articles) {
    // BUGFIX: sort a copy — Array#sort mutates the caller's array.
    const index = [...articles]
      .sort((a, b) => b.published_at - a.published_at)
      .map(a => `- [${a.title}](./posts/${a.slug}.md) (${a.source})`)
      .join('\n');

    const content = `# Blog Archive\n\n${index}`;
    await fs.writeFile(path.join(this.outputDir, 'index.md'), content);
  }
}

Full Pipeline

/**
 * End-to-end sync: fetch → normalize → dedupe/store → generate pages.
 * Logs progress to stdout; per-article storage failures are logged and
 * skipped rather than aborting the whole run.
 */
async function runPipeline() {
  const config = {
    hackernoon: { username: 'zbruceli' },
    rss: ['https://example.com/feed.xml']
  };

  const fetcher = new ArticleFetcher();
  const normalizer = new ArticleNormalizer();
  const store = new ArticleStore('./blog.db');
  const generator = new BlogGenerator('./output');

  // Fetch
  console.log('Fetching articles...');
  const rawArticles = await fetcher.fetchAllSources(config);

  // Normalize & Store
  console.log(`Processing ${rawArticles.length} articles...`);
  const stored = [];

  for (const article of rawArticles) {
    // BUGFIX: normalize() takes a required source argument that was
    // omitted, so every record was stored with source undefined. Fall
    // back to 'unknown' until the fetcher tags each article's origin.
    const normalized = normalizer.normalize(article, article.source ?? 'unknown');

    try {
      const result = await store.save(normalized);
      if (result.status === 'saved') {
        stored.push(normalized);
      }
    } catch (err) {
      // One bad article shouldn't abort the whole sync.
      console.error(`Failed to save "${normalized.title}":`, err);
    }
  }

  // Generate
  console.log(`Generating blog pages for ${stored.length} articles...`);
  await generator.generatePages(stored);
  await generator.generateIndex(stored);

  console.log('Done!');
}

// Run on a schedule (e.g., daily at 6 AM)
const cron = require('node-cron');
cron.schedule('0 6 * * *', () => {
  // BUGFIX: catch rejections — passing the async function directly would
  // leave a failed run as an unhandled promise rejection (crashes Node).
  runPipeline().catch((err) => console.error('Pipeline run failed:', err));
});

Next: Deployment

Automatically push updated blog to GitHub Pages, Vercel, or Netlify after generation.