Setup
Bash
pip install requests beautifulsoup4 lxml

Basic Page Fetch
Python
import requests
from bs4 import BeautifulSoup
# Fetch a single page and fail fast on network or HTTP errors.
url = "https://example.com"
response = requests.get(url, timeout=10)  # bound the wait; no timeout can hang forever
response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses

soup = BeautifulSoup(response.text, "lxml")

# Not every page has a <title>; soup.title is None in that case, so guard
# before dereferencing .string.
print(soup.title.string if soup.title is not None else None)

# href=True keeps only anchors that actually carry an href attribute,
# so link["href"] cannot raise KeyError.
for link in soup.find_all("a", href=True):
    print(link["href"], link.get_text(strip=True))

# CSS Selectors
Python
# select() returns every element matching a CSS selector; select_one()
# returns only the first match, or None when nothing matches.
headings = soup.select("h2.article-title")
first_para = soup.select_one("div.content > p")

# src=True keeps only thumbnails that carry a src attribute, so img["src"]
# below cannot raise KeyError (e.g. on lazy-loaded images using data-src).
images = soup.find_all("img", attrs={"class": "thumbnail"}, src=True)
for img in images:
    # alt is optional in real-world HTML, hence .get() with a default.
    print(img["src"], img.get("alt", ""))

# Handling Pagination
Python
import time
def scrape_all_pages(base_url, max_pages=10, *, delay=1.0):
    """Collect title/price records from a paginated listing.

    Requests ``base_url`` with ``page=1``, ``page=2``, ... as a query
    parameter and stops at the first non-200 response, the first page that
    contains no ``.item-card`` elements, or after ``max_pages`` pages,
    whichever comes first.

    Args:
        base_url: Listing URL; the page number is sent as the ``page``
            query parameter.
        max_pages: Upper bound on the number of pages fetched. Values
            below 1 fetch nothing and yield an empty list.
        delay: Seconds to pause between consecutive requests.

    Returns:
        A list of ``{"title": ..., "price": ...}`` dicts in page order.
        Cards missing either an ``h3`` or a ``.price`` element are skipped.

    Raises:
        requests.RequestException: Network errors and timeouts from
            ``requests.get`` are not caught here and propagate to the caller.
    """
    results = []
    for page in range(1, max_pages + 1):
        # Pause only *between* requests, so neither the first nor the last
        # page pays for a pointless sleep.
        if page > 1:
            time.sleep(delay)

        resp = requests.get(base_url, params={"page": page}, timeout=10)
        if resp.status_code != 200:
            break

        soup = BeautifulSoup(resp.text, "lxml")
        items = soup.select(".item-card")
        if not items:
            # An empty page is treated as the end of the listing.
            break

        for item in items:
            title = item.select_one("h3")
            price = item.select_one(".price")
            # select_one() returns None when the element is absent; skip
            # incomplete cards instead of crashing on None.get_text().
            if title is None or price is None:
                continue
            results.append({
                "title": title.get_text(strip=True),
                "price": price.get_text(strip=True),
            })
    return results


# Saving Results
Python
import json, csv
data = [{"name": "Alice", "score": 95}]

# Explicit UTF-8 keeps the output identical across platforms; without it
# open() falls back to the locale's default encoding.
with open("results.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)

# newline="" lets the csv module control line endings itself (otherwise
# extra blank rows can appear on Windows).
with open("results.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["name", "score"])
    writer.writeheader()
    writer.writerows(data)

# Note: Always check a site's robots.txt and Terms of Service before scraping.
# Add delays between requests to avoid overloading servers.