Usa Shifter con Beautiful Soup
Combina los proxies residenciales y de ISP de Shifter con Beautiful Soup para un scraping en Python limpio y expresivo. Beautiful Soup gestiona el análisis de HTML, Shifter gestiona las IPs residenciales: no se necesita navegador sin interfaz gráfica.
Inicio rápido
Instalar
pip install beautifulsoup4 requests lxml
Uso básico
import requests
from bs4 import BeautifulSoup
# Quick start: fetch one page through a sticky Shifter session and parse it.
# The requests documentation states that proxy URLs must include the scheme.
# NOTE(review): "http://" is assumed here; confirm the scheme Shifter expects.
proxy_url = "http://customer-USERNAME-country-us-sid-123ABC:PASSWORD@p.shifter.io:443"
proxies = {"http": proxy_url, "https": proxy_url}

response = requests.get("https://example.com", proxies=proxies, timeout=30)
# Stop on 4xx/5xx rather than parsing an error or block page.
response.raise_for_status()

soup = BeautifulSoup(response.text, "lxml")
# soup.title is None when the page has no <title> element.
print(soup.title.string if soup.title is not None else None)
for article in soup.select("article.post"):
    print(article.h2.text.strip(), "->", article.a["href"])
Características
Ejemplos
Sesión persistente + rastreo multipágina
Fija una IP residencial durante todo un rastreo de paginación añadiendo `sid-XXX` al nombre de usuario del proxy. Añade `country-uk` y `city-london` para segmentación geográfica.
import requests
import secrets
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Paginated crawl over a single sticky proxy session.
# One random sid is generated per run and reused for every page request.
sid = secrets.token_hex(4)
# The requests documentation states that proxy URLs must include the scheme.
# NOTE(review): "http://" is assumed here; confirm the scheme Shifter expects.
proxy_url = (
    f"http://customer-USERNAME-country-uk-city-london-sid-{sid}-ttl-300:"
    f"PASSWORD@p.shifter.io:443"
)

products = []
seen_pages = set()
url = "https://example.co.uk/products"

# Use a session so connection pooling and cookies persist across requests;
# the with-block closes the pooled connections once the crawl is over.
with requests.Session() as session:
    session.proxies = {"http": proxy_url, "https": proxy_url}
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Accept-Language": "en-GB,en;q=0.9",
    })

    # Stop when there is no "next" link, or when pagination points back to a
    # page that was already crawled (which would otherwise loop forever).
    while url and url not in seen_pages:
        seen_pages.add(url)
        response = session.get(url, timeout=30)
        # Do not scrape error or block pages as if they were listings.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")

        for card in soup.select(".product-card"):
            title = card.select_one("h2")
            price = card.select_one(".price")
            link = card.select_one("a[href]")
            # Skip incomplete cards instead of crashing on a None lookup.
            if title is None or price is None or link is None:
                continue
            products.append({
                "title": title.text.strip(),
                "price": price.text.strip(),
                "url": urljoin(url, link["href"]),
            })

        # Require an href so a bare <a class="next-page"> ends the crawl.
        next_link = soup.select_one("a.next-page[href]")
        url = urljoin(url, next_link["href"]) if next_link is not None else None
print(f"Scraped {len(products)} products")
Scraping en paralelo con concurrent.futures
Elimina el sid para rotación por solicitud. ThreadPoolExecutor + requests + Shifter escala a decenas de solicitudes concurrentes sin activar los límites de velocidad por IP.
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
# No sid -> every request gets a different residential IP.
# The requests documentation states that proxy URLs must include the scheme.
# NOTE(review): "http://" is assumed here; confirm the scheme Shifter expects.
PROXY_URL = "http://customer-USERNAME-country-us:PASSWORD@p.shifter.io:443"


def scrape(url: str) -> dict:
    """Fetch *url* through the proxy and extract a few fields from the page.

    Returns a dict with the requested ``url``, the page ``title`` (empty
    string when the page has none), the text of every ``h1`` and the ``href``
    of the first 20 links.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.RequestException: the request itself failed.
    """
    response = requests.get(
        url,
        proxies={"http": PROXY_URL, "https": PROXY_URL},
        headers={"User-Agent": "Mozilla/5.0 AppleWebKit/537.36"},
        timeout=30,
    )
    # Surface error pages as exceptions so callers can report them.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    # soup.title is None when the page has no <title> element.
    title = soup.title.string if soup.title is not None else None
    return {
        "url": url,
        "title": (title or "").strip(),
        "h1": [h.text.strip() for h in soup.select("h1")],
        "links": [a["href"] for a in soup.select("a[href]")[:20]],
    }


# Pages to scrape.
urls = [
    "https://example.com/category/laptops",
    "https://example.com/category/phones",
    "https://example.com/category/tablets",
    "https://example.com/category/wearables",
    # ... hundreds more
]
with ThreadPoolExecutor(max_workers=16) as pool:
    futures = {pool.submit(scrape, u): u for u in urls}
    for f in as_completed(futures):
        try:
            result = f.result()
            print(result["url"], "->", result["title"])
        except Exception as exc:
            print("error:", futures[f], exc)
Rastreo robusto con reintentos y retroceso exponencial
El scraping en producción necesita reintentos ante errores 5xx y de conexión. Combina urllib3 Retry con Shifter y un sid nuevo por intento para superar bloqueos transitorios.
import requests
import secrets
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
class ShifterClient:
    """``requests.Session`` wrapper that uses a new proxy ``sid`` per attempt.

    Retries are driven by :meth:`get` rather than by the transport adapter.
    Adapter-level retries replay the request over the same proxy URL, so the
    ``sid`` would never change between attempts. Here every attempt builds a
    fresh proxy URL. That a new ``sid`` yields a new exit IP is the proxy
    service's behaviour; this class only guarantees the new ``sid``.
    """

    # HTTP statuses that trigger another attempt.
    RETRY_STATUSES = frozenset({429, 500, 502, 503, 504})

    def __init__(self, country="us", total_retries=5, backoff_factor=1.5):
        """Create the client.

        Args:
            country: country code inserted into the proxy username.
            total_retries: extra attempts allowed after the first one.
            backoff_factor: base delay in seconds; the wait before retry
                number ``n`` (1-based) is ``backoff_factor * 2 ** (n - 1)``.
        """
        self.country = country
        self.total_retries = total_retries
        self.backoff_factor = backoff_factor
        self._session = requests.Session()
        # max_retries=0: the adapter must not retry on its own, otherwise the
        # replay would reuse the proxy URL of the failed attempt.
        adapter = HTTPAdapter(max_retries=0, pool_connections=20)
        self._session.mount("http://", adapter)
        self._session.mount("https://", adapter)

    def _proxy(self) -> str:
        """Return a proxy URL carrying a newly generated random ``sid``."""
        sid = secrets.token_hex(4)
        # The requests documentation states that proxy URLs must include the
        # scheme.
        # NOTE(review): "http://" is assumed; confirm the scheme Shifter expects.
        return (
            f"http://customer-USERNAME-country-{self.country}-sid-{sid}:"
            f"PASSWORD@p.shifter.io:443"
        )

    def get(self, url: str, **kwargs) -> "requests.Response":
        """GET *url*, retrying transient failures with a new ``sid`` each time.

        Extra keyword arguments are passed to ``Session.get``; ``timeout``
        defaults to 30 seconds.

        Returns:
            The first response whose status is not in ``RETRY_STATUSES``.

        Raises:
            requests.exceptions.RetryError: every attempt ended with a status
                listed in ``RETRY_STATUSES``.
            requests.ConnectionError, requests.Timeout: the final attempt
                failed at the connection level.
        """
        import time  # local import: the snippet's import block has no `time`

        timeout = kwargs.pop("timeout", 30)
        for attempt in range(self.total_retries + 1):
            # One sid per attempt, shared by both schemes so plain and TLS
            # traffic of the same attempt use the same proxy session.
            proxy = self._proxy()
            last_attempt = attempt == self.total_retries
            try:
                response = self._session.get(
                    url,
                    proxies={"http": proxy, "https": proxy},
                    timeout=timeout,
                    **kwargs,
                )
            except (requests.ConnectionError, requests.Timeout):
                if last_attempt:
                    raise
            else:
                if response.status_code not in self.RETRY_STATUSES:
                    return response
                # Release the connection before trying again.
                response.close()
                if last_attempt:
                    raise requests.exceptions.RetryError(
                        f"{url}: still HTTP {response.status_code} after "
                        f"{attempt + 1} attempts"
                    )
            # Exponential backoff: factor, 2*factor, 4*factor, ...
            time.sleep(self.backoff_factor * 2 ** attempt)
# Fetch a German product listing through the retrying client and parse it.
client = ShifterClient(country="de")
response = client.get("https://example.de/products")
# Statuses outside the client's retry list (e.g. 404) come back as ordinary
# responses, so check explicitly before parsing the body.
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")
for product in soup.select(".product"):
    print(product.h2.text.strip(), product.select_one(".price").text.strip())
httpx (async) + Beautiful Soup
Si necesitas expansión asíncrona para miles de páginas, sustituye requests por httpx. La misma URL de Shifter, async/await nativo y compatibilidad total con Beautiful Soup.
# pip install httpx beautifulsoup4 lxml
import asyncio
import httpx
from bs4 import BeautifulSoup
# httpx only accepts proxy URLs with an explicit http, https or socks scheme.
# NOTE(review): "http://" is assumed here; confirm the scheme Shifter expects.
PROXY = "http://customer-USERNAME-country-fr-sid-789GHI:PASSWORD@p.shifter.io:443"


async def fetch(client: httpx.AsyncClient, url: str) -> dict:
    """Download *url* with *client* and return its title and ``h2`` headings.

    The result has the keys ``url``, ``title`` (empty string when the page
    has no title) and ``headings`` (stripped text of every ``h2``).
    """
    resp = await client.get(url, timeout=30)
    soup = BeautifulSoup(resp.text, "lxml")
    # soup.title is None when the page has no <title> element.
    title = soup.title.string if soup.title is not None else None
    return {
        "url": url,
        "title": (title or "").strip(),
        "headings": [h.text.strip() for h in soup.select("h2")],
    }
async def main():
    """Fetch 50 listing pages concurrently and print each page's title.

    A page that fails is reported on its own line and does not prevent the
    other results from being printed.
    """
    urls = [
        f"https://example.fr/products?page={i}" for i in range(1, 51)
    ]
    async with httpx.AsyncClient(proxy=PROXY) as client:
        # return_exceptions=True: without it the first failing page raises out
        # of gather() and the results of every other page are lost.
        results = await asyncio.gather(
            *[fetch(client, u) for u in urls],
            return_exceptions=True,
        )
    # gather() keeps input order, so results line up with urls.
    for page_url, r in zip(urls, results):
        if isinstance(r, BaseException):
            print("error:", page_url, r)
        else:
            print(r["url"], "->", r["title"])
asyncio.run(main())
Preguntas frecuentes
Preguntas frecuentes sobre el uso de Shifter con Beautiful Soup.
No. Beautiful Soup es un analizador sintáctico y no realiza solicitudes HTTP. El proxy se configura en el cliente HTTP que uses junto con bs4 (requests, httpx, aiohttp, urllib). Una vez obtenido el HTML a través de Shifter, se lo pasas a BeautifulSoup() de la forma habitual.
Empieza a usar Shifter con Beautiful Soup
Combina los más de 205M de proxies residenciales y de ISP de Shifter con Beautiful Soup para un scraping en Python limpio y expresivo. Rotación por solicitud, sesiones persistentes y soporte asíncrono completo mediante httpx.