Complete Implementation
Python
import csv
import hashlib
import json
from dataclasses import dataclass, asdict
from datetime import datetime
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
@dataclass
class Article:
    """A single scraped headline and where it came from."""

    title: str  # link text of the headline
    url: str  # link target taken from the page
    source: str  # label of the site the headline was scraped from
    scraped_at: str = ""  # ISO-8601 timestamp; filled in automatically when empty

    def __post_init__(self) -> None:
        """Stamp the article with the current local time if none was supplied."""
        if not self.scraped_at:
            self.scraped_at = datetime.now().isoformat()

    @property
    def id(self) -> str:
        """Return a short fingerprint: the first 8 hex digits of the URL's MD5.

        MD5 serves only as a compact, stable key here, not for security.
        Two articles with the same ``url`` always share the same ``id``.
        """
        return hashlib.md5(self.url.encode()).hexdigest()[:8]
# Registry of sites to scrape, keyed by display name. Each entry supplies:
#   "url"      - page to download
#   "selector" - CSS selector matching the headline link elements
#   "attr"     - attribute of each matched element that holds the article URL
SOURCES = {
    "Hacker News": {
        "url": "https://news.ycombinator.com",
        # Hacker News no longer emits the old "storylink" class; headline
        # anchors are now direct children of <span class="titleline">.
        "selector": ".titleline > a",
        "attr": "href",
    },
}
def scrape(name, config, limit=20):
    """Download one source page and return the headlines found on it.

    Args:
        name: Display name of the source; stored on each ``Article`` and
            used in the failure message.
        config: Mapping with the keys ``"url"`` (page to fetch),
            ``"selector"`` (CSS selector for headline elements) and
            ``"attr"`` (attribute holding each headline's link).
        limit: Number of matched elements to examine, counted before
            filtering. ``None`` examines every match.

    Returns:
        A list of ``Article`` objects. Elements without text or without a
        link are skipped. Any failure (network, HTTP status, parsing) is
        reported on stdout and yields an empty list, so one broken source
        does not abort the whole run.
    """
    try:
        response = requests.get(
            config["url"],
            headers={"User-Agent": "NewsBot/1.0"},
            timeout=15,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")

        articles = []
        for element in soup.select(config["selector"])[:limit]:
            title = element.get_text(strip=True)
            href = element.get(config["attr"], "")
            # An article without a title or a link is useless downstream, and
            # empty links would all hash to the same id during deduplication.
            if not title or not href:
                continue
            # Resolve site-relative links (e.g. "item?id=1") against the page
            # URL; absolute links pass through unchanged.
            link = urljoin(config["url"], href)
            articles.append(Article(title=title, url=link, source=name))
        return articles
    except Exception as e:
        # Deliberately broad: scraping is best-effort per source.
        print(f"Failed {name}: {e}")
        return []
def deduplicate(articles):
    """Drop articles whose ``id`` has already been seen.

    Args:
        articles: Iterable of objects exposing an ``id`` attribute.

    Returns:
        A new list holding the first article for each distinct ``id``, in
        the order those first occurrences appeared in the input.
    """
    seen_ids = set()
    unique = []
    for article in articles:
        # Read the id once; on Article it is a property that hashes the URL.
        article_id = article.id
        if article_id in seen_ids:
            continue
        seen_ids.add(article_id)
        unique.append(article)
    return unique
# Script body: scrape every configured source, remove duplicates, then save
# the result as JSON and CSV under ./news_data and print a short preview.
all_articles = []
for name, config in SOURCES.items():
    arts = scrape(name, config)
    all_articles.extend(arts)
    print(f"{name}: {len(arts)} articles")

unique = deduplicate(all_articles)
rows = [asdict(a) for a in unique]  # built once, shared by both writers

out = Path("news_data")
out.mkdir(exist_ok=True)
date = datetime.now().strftime("%Y%m%d")  # one file pair per calendar day

# Write with an explicit UTF-8 encoding so output does not depend on the
# platform's default codec (headlines routinely contain non-ASCII text).
(out / f"news_{date}.json").write_text(json.dumps(rows, indent=2), encoding="utf-8")

# newline="" lets the csv module control line endings itself.
with open(out / f"news_{date}.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=["title", "url", "source", "scraped_at"])
    w.writeheader()
    w.writerows(rows)

print(f"\nSaved {len(unique)} articles to {out}/")
for i, a in enumerate(unique[:5], 1):
    print(f" {i}. [{a.source}] {a.title}")

# Bash: install the dependencies and run the script with the commands below.
pip install requests beautifulsoup4 lxml
python scraper.py

Tip: Schedule the script with cron to run daily and build a historical news archive.