183. Web Scraping

Snippet 1: Basic Web Scraping with BeautifulSoup

from bs4 import BeautifulSoup
import requests

# Fetch a page and print the text of its <title> element.
url = "https://example.com"
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, timeout=10)
# Fail fast on 4xx/5xx instead of parsing an error page as if it were content.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# find() returns None when the document has no <title>, so guard before
# reading .text; in that case the title is reported as None.
title_tag = soup.find("title")
title = title_tag.text if title_tag is not None else None
print(f"Page Title: {title}")

Snippet 2: Extracting All Links from a Web Page

from bs4 import BeautifulSoup
import requests

# Fetch a page and print the href of every anchor that has one.
url = "https://example.com"
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, timeout=10)
# Fail fast on 4xx/5xx instead of collecting links from an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# href=True restricts the match to <a> tags that actually carry an href
# attribute, so the subscript below cannot raise KeyError. Values are kept
# exactly as written in the page (they may be relative).
links = [a["href"] for a in soup.find_all("a", href=True)]
print("Links found:", links)

Snippet 3: Scraping Table Data with BeautifulSoup

from bs4 import BeautifulSoup
import requests

# Fetch a page and print the cell text of each row of its first <table>.
url = "https://example.com/table"
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, timeout=10)
# Fail fast on 4xx/5xx instead of parsing an error page as if it were content.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# find() returns None when the page contains no <table>; fall back to an
# empty row list instead of crashing on None.find_all(...).
table = soup.find("table")
rows = table.find_all("tr") if table is not None else []
if not rows:
    print("No table rows found")

for row in rows:
    # Only <td> cells are collected, so a header row made solely of <th>
    # cells prints as an empty list.
    cells = row.find_all("td")
    print([cell.text.strip() for cell in cells])

Snippet 4: Using Scrapy Shell to Inspect a Web Page

# Open an interactive Scrapy shell session for the given URL
scrapy shell "https://example.com"

In the Scrapy shell:

response.css('title::text').get()  # Page title: first matching text node, or None if nothing matches
response.css('a::attr(href)').getall()  # All links: list of the href values selected from <a> elements

Snippet 5: Basic Scraper with Scrapy

import scrapy

class ExampleSpider(scrapy.Spider):
    """Minimal spider that prints the title text of its start page."""

    name = "example"
    start_urls = ["https://example.com"]

    def parse(self, response):
        """Print the first ``<title>`` text node selected from *response*."""
        title_selector = response.css("title::text")
        page_title = title_selector.get()
        print(f"Page Title: {page_title}")

Snippet 6: Scraping Data into a JSON File with Scrapy

import scrapy

class QuotesSpider(scrapy.Spider):
    """Spider that yields one ``{"text", "author"}`` dict per quote block."""

    name = "quotes"
    start_urls = ["http://quotes.toscrape.com"]

    def parse(self, response):
        """Yield a dict for every ``div.quote`` element found in *response*."""
        for quote_block in response.css("div.quote"):
            quote_text = quote_block.css("span.text::text").get()
            quote_author = quote_block.css("span small.author::text").get()
            yield {"text": quote_text, "author": quote_author}

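# Save data by running: scrapy crawl quotes -o quotes.json
# Note: -o appends to an existing file, which leaves quotes.json invalid on a
# second run; delete the file first, or use -O (overwrite) on recent Scrapy versions.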

Snippet 7: Scraping Images with BeautifulSoup

from bs4 import BeautifulSoup
import requests
import os
from urllib.parse import urljoin, urlparse

# Download every <img> referenced by a page into ./images/image_<index>.jpg.
url = "https://example.com"
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, timeout=10)
# Fail fast on 4xx/5xx instead of scanning an error page for images.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# src attributes are frequently relative ("/img/a.png", "logo.png"); resolve
# them against the final page URL (after redirects) so requests can fetch them.
images = [urljoin(response.url, img["src"]) for img in soup.find_all("img", src=True)]
os.makedirs("images", exist_ok=True)

for i, img_url in enumerate(images):
    # Inline "data:" URIs and other non-HTTP schemes cannot be downloaded.
    if urlparse(img_url).scheme not in ("http", "https"):
        continue
    try:
        img_response = requests.get(img_url, timeout=10)
        # Do not write an HTML error body to disk as if it were an image.
        img_response.raise_for_status()
    except requests.RequestException as exc:
        # One broken image should not abort the remaining downloads.
        print(f"Skipping {img_url}: {exc}")
        continue
    # NOTE: the file is always named *.jpg, whatever the real image format is.
    with open(f"images/image_{i}.jpg", "wb") as img_file:
        img_file.write(img_response.content)

Snippet 8: Handling Pagination with Scrapy

import scrapy

class PaginationSpider(scrapy.Spider):
    """Spider that yields quotes and follows the "next" link page by page."""

    name = "pagination"
    start_urls = ["http://quotes.toscrape.com/page/1/"]

    def parse(self, response):
        """Yield one dict per ``div.quote``, then a request for the next page.

        The follow-up request is only produced when the ``li.next`` anchor
        yields a non-empty href; it is handled by this same method.
        """
        for quote_block in response.css("div.quote"):
            quote_text = quote_block.css("span.text::text").get()
            quote_author = quote_block.css("span small.author::text").get()
            yield {"text": quote_text, "author": quote_author}

        next_href = response.css("li.next a::attr(href)").get()
        if not next_href:
            return
        yield response.follow(next_href, callback=self.parse)

Snippet 9: Extracting Metadata with BeautifulSoup

from bs4 import BeautifulSoup
import requests

# Fetch a page and print a name -> content mapping of its <meta> tags.
url = "https://example.com"
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, timeout=10)
# Fail fast on 4xx/5xx instead of reading metadata from an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Only <meta> tags that carry BOTH a name and a content attribute are
# selected, so the subscripts below cannot raise KeyError.
meta_tags = soup.find_all("meta", attrs={"name": True, "content": True})
# If a name occurs more than once, the last occurrence wins.
metadata = {meta["name"]: meta["content"] for meta in meta_tags}
print("Metadata:", metadata)

Snippet 10: Scraping JSON Data with Requests

import requests

# Fetch a JSON document from an API and print two fields of each item.
url = "https://api.example.com/data"
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, timeout=10)
# Fail fast on 4xx/5xx: error responses are often not JSON, or not the
# expected payload, and would otherwise fail later with a confusing error.
response.raise_for_status()
data = response.json()

# NOTE(review): assumes the payload is a JSON array of objects that each have
# "name" and "value" keys - confirm against the real API.
for item in data:
    print(item["name"], item["value"])

Previous: 182. Metaclasses | Next: 184. API Development

Last updated 2 months ago