Python Web Scraper Project – Build a News Aggregator

Complete Implementation

Python

import requests, csv, json, hashlib
from datetime import datetime
from pathlib import Path
from bs4 import BeautifulSoup
from dataclasses import dataclass, asdict

@dataclass
class Article:
    """A single scraped headline.

    Attributes:
        title: Headline text.
        url: Link taken from the scraped element.
        source: Name of the site the headline came from.
        scraped_at: ISO-8601 timestamp string; filled in with the current
            local time at construction when left empty.
    """

    title: str
    url: str
    source: str
    scraped_at: str = ""

    def __post_init__(self):
        # Keep a caller-supplied timestamp; otherwise stamp with "now".
        self.scraped_at = self.scraped_at or datetime.now().isoformat()

    @property
    def id(self):
        """Return a short identifier: the first 8 hex digits of the URL's MD5."""
        digest = hashlib.md5(self.url.encode())
        return digest.hexdigest()[:8]

# Sites to scrape, keyed by display name. Each entry provides:
#   "url"      - page to fetch
#   "selector" - CSS selector matching the headline link elements
#   "attr"     - attribute on each matched element that holds the article URL
SOURCES = {
    "Hacker News": {
        "url": "https://news.ycombinator.com",
        # Story links sit directly inside <span class="titleline">; the older
        # "storylink" class is no longer present in the page, so it matched
        # nothing. The child combinator skips the nested site-name link.
        "selector": ".titleline > a",
        "attr": "href",
    },
}

def scrape(name, config):
    """Fetch one news source and return its headline links as ``Article`` objects.

    Args:
        name: Display name of the source; stored on each article and used in
            the failure message.
        config: Mapping with the keys ``"url"`` (page to fetch), ``"selector"``
            (CSS selector matching the headline elements) and ``"attr"``
            (attribute on each matched element that holds the link).

    Returns:
        A list of at most 20 ``Article`` objects, one per matched element that
        has non-empty text. Relative links are resolved against
        ``config["url"]``. On any failure the error is printed and an empty
        list is returned.
    """
    # Local import keeps this function self-contained; urljoin is stdlib.
    from urllib.parse import urljoin

    try:
        r = requests.get(config["url"], headers={"User-Agent": "NewsBot/1.0"}, timeout=15)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "lxml")

        articles = []
        # The 20-item cap is applied to the raw matches, before empty titles
        # are dropped.
        for link in soup.select(config["selector"])[:20]:
            title = link.get_text(strip=True)
            if not title:
                continue
            href = link.get(config["attr"], "")
            # Pages may emit site-relative links (e.g. "item?id=1"); make them
            # absolute so the stored URL is usable on its own. A missing
            # attribute stays "" rather than collapsing to the page URL.
            url = urljoin(config["url"], href) if href else ""
            articles.append(Article(title=title, url=url, source=name))
        return articles
    except Exception as e:
        # Deliberate best-effort boundary: a failure in one source (network,
        # HTTP status, parsing) is reported and must not abort the whole run.
        print(f"Failed {name}: {e}")
        return []

def deduplicate(articles):
    """Return ``articles`` with later items that repeat an earlier ``id`` removed.

    The first item seen for each ``id`` is kept, and the result preserves the
    order in which those first items appeared.
    """
    first_by_id = {}
    for article in articles:
        # setdefault only stores the article if its id has not been seen yet.
        first_by_id.setdefault(article.id, article)
    return list(first_by_id.values())

# Gather articles from every configured source and report a per-source count.
all_articles = []
for name, config in SOURCES.items():
    arts = scrape(name, config)
    all_articles.extend(arts)
    print(f"{name}: {len(arts)} articles")

unique = deduplicate(all_articles)

# Output goes to ./news_data/, one JSON and one CSV file per calendar day;
# a second run on the same day overwrites that day's files.
out = Path("news_data")
out.mkdir(exist_ok=True)
date = datetime.now().strftime("%Y%m%d")

# Convert once and reuse the rows for both output formats.
rows = [asdict(a) for a in unique]

# Write with an explicit UTF-8 encoding: the platform default may be a legacy
# code page that cannot represent non-ASCII headline characters.
(out / f"news_{date}.json").write_text(json.dumps(rows, indent=2), encoding="utf-8")
with open(out / f"news_{date}.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=["title", "url", "source", "scraped_at"])
    w.writeheader()
    w.writerows(rows)

print(f"\nSaved {len(unique)} articles to {out}/")
# Preview the first five headlines.
for i, a in enumerate(unique[:5], 1):
    print(f"  {i}. [{a.source}] {a.title}")

Bash

# Install the third-party packages the scraper uses: requests (HTTP),
# beautifulsoup4 (HTML parsing) and lxml (the parser passed to BeautifulSoup).
pip install requests beautifulsoup4 lxml
# Run the scraper; assumes the Python listing above was saved as scraper.py.
python scraper.py

Tip: Schedule the script with cron to run once a day. Each run writes files named after the current date, so daily runs build up a historical news archive.