diff --git a/unshackle/core/session.py b/unshackle/core/session.py
index 4cda472..d1a03e2 100644
--- a/unshackle/core/session.py
+++ b/unshackle/core/session.py
@@ -2,9 +2,17 @@
 
 from __future__ import annotations
 
+import logging
+import math
+import random
+import time
 import warnings
+from datetime import datetime, timezone
+from email.utils import parsedate_to_datetime
+from typing import Any, List, Optional, Set, Tuple
+from urllib.parse import urlparse
 
-from curl_cffi.requests import Session as CurlSession
+from curl_cffi import Response, Session, exceptions
 
 from unshackle.core.config import config
 
@@ -15,18 +23,128 @@ warnings.filterwarnings(
 )
 
 
-class Session(CurlSession):
-    """curl_cffi Session with warning suppression."""
+class MaxRetriesError(exceptions.RequestException):
+    """Raised when a request still fails after every retry attempt has been used."""
+
+    def __init__(self, message, cause=None):
+        super().__init__(message)
+        # Chain explicitly so the last underlying failure shows up in tracebacks.
+        self.__cause__ = cause
+
 
-    def request(self, method, url, **kwargs):
-        with warnings.catch_warnings():
-            warnings.filterwarnings(
-                "ignore", message="Make sure you are using https over https proxy.*", category=RuntimeWarning
-            )
+class CurlSession(Session):
+    """curl_cffi Session that retries failed requests with exponential backoff."""
+
+    def __init__(
+        self,
+        max_retries: int = 10,
+        backoff_factor: float = 0.2,
+        max_backoff: float = 60.0,
+        status_forcelist: Optional[List[int]] = None,
+        allowed_methods: Optional[Set[str]] = None,
+        catch_exceptions: Optional[Tuple[type[Exception], ...]] = None,
+        **session_kwargs: Any,
+    ):
+        super().__init__(**session_kwargs)
+
+        self.max_retries = max_retries
+        self.backoff_factor = backoff_factor
+        self.max_backoff = max_backoff
+        # Compare against None so an explicitly empty collection disables that retry trigger.
+        self.status_forcelist = [429, 500, 502, 503, 504] if status_forcelist is None else status_forcelist
+        self.allowed_methods = (
+            {"GET", "POST", "HEAD", "OPTIONS", "PUT", "DELETE", "TRACE"}
+            if allowed_methods is None
+            else {m.upper() for m in allowed_methods}
+        )
+        self.catch_exceptions = (
+            (exceptions.ConnectionError, exceptions.SSLError, exceptions.Timeout)
+            if catch_exceptions is None
+            else catch_exceptions
+        )
+        self.log = logging.getLogger(self.__class__.__name__)
+
+    @staticmethod
+    def _parse_retry_after(value: str) -> float | None:
+        """Return the delay in seconds requested by a Retry-After value, or None if it is unusable."""
+        try:
+            delay = float(value)
+        except ValueError:
+            try:
+                retry_date = parsedate_to_datetime(value)
+            except (TypeError, ValueError):
+                # Neither delta-seconds nor an HTTP-date; let the caller fall back to backoff.
+                return None
+            if retry_date.tzinfo is None:
+                # Dates without a usable zone (e.g. "-0000") parse as naive; treat them as UTC.
+                retry_date = retry_date.replace(tzinfo=timezone.utc)
+            delay = (retry_date - datetime.now(timezone.utc)).total_seconds()
+
+        if not math.isfinite(delay):
+            return None
+        return max(delay, 0.0)
+
+    def _get_sleep_time(self, response: Response | None, attempt: int) -> float | None:
+        """
+        Return how long to wait, in seconds, before the given retry attempt.
+
+        A usable Retry-After header on the response takes precedence; otherwise
+        exponential backoff with +/-10% jitter, capped at max_backoff, is used.
+        """
+        if response is not None:
+            retry_after = response.headers.get("Retry-After")
+            if retry_after:
+                delay = self._parse_retry_after(retry_after)
+                if delay is not None:
+                    return delay
+
+        if attempt == 0:
+            return 0.0
+
+        backoff_value = self.backoff_factor * (2 ** (attempt - 1))
+        jitter = backoff_value * 0.1
+        sleep_time = backoff_value + random.uniform(-jitter, jitter)
+        return min(sleep_time, self.max_backoff)
+
+    def request(self, method: str, url: str, **kwargs: Any) -> Response:
+        """
+        Send a request, retrying on configured status codes and exceptions.
+
+        Methods outside allowed_methods are sent once without retry handling.
+        Raises MaxRetriesError once max_retries retries have all failed.
+        """
+        if method.upper() not in self.allowed_methods:
             return super().request(method, url, **kwargs)
 
+        last_exception = None
+        response = None
 
-def session(browser: str | None = None, **kwargs) -> Session:
+        for attempt in range(self.max_retries + 1):
+            try:
+                response = super().request(method, url, **kwargs)
+            except self.catch_exceptions as e:
+                last_exception = e
+                response = None
+                failure = e.__class__.__name__
+            else:
+                if response.status_code not in self.status_forcelist:
+                    return response
+                last_exception = exceptions.HTTPError(f"Received status code: {response.status_code}")
+                failure = f"{response.status_code} {response.reason}"
+
+            if attempt >= self.max_retries:
+                # Out of retries: nothing left to announce or wait for.
+                break
+
+            self.log.warning(f"{failure}({urlparse(url).path}). Retrying... ({attempt + 1}/{self.max_retries})")
+            sleep_duration = self._get_sleep_time(response, attempt + 1)
+            if sleep_duration and sleep_duration > 0:
+                time.sleep(sleep_duration)
+
+        raise MaxRetriesError(f"Max retries exceeded for {method} {url}", cause=last_exception)
+
+
+def session(browser: str | None = None, **kwargs) -> CurlSession:
     """
     Create a curl_cffi session that impersonates a browser.
 
@@ -48,32 +166,42 @@ def session(browser: str | None = None, **kwargs) -> Session:
             - allow_redirects: Follow redirects (bool, default True)
             - max_redirects: Maximum redirect count (int)
             - cert: Client certificate (str or tuple)
+            - ja3: JA3 fingerprint (str)
+            - akamai: Akamai fingerprint (str)
+
+        Extra arguments for retry handler:
+            - max_retries: Maximum number of retries (int, default 10)
+            - backoff_factor: Backoff factor (float, default 0.2)
+            - max_backoff: Maximum backoff time in seconds (float, default 60.0)
+            - status_forcelist: List of status codes to force retry (list, default [429, 500, 502, 503, 504])
+            - allowed_methods: Set of HTTP methods eligible for retry (set, default {"GET", "POST", "HEAD", "OPTIONS", "PUT", "DELETE", "TRACE"})
+            - catch_exceptions: Tuple of exception types that trigger a retry (tuple, default (exceptions.ConnectionError, exceptions.SSLError, exceptions.Timeout))
 
     Returns:
         curl_cffi.requests.Session configured with browser impersonation, common headers,
         and equivalent retry behavior to requests.Session.
 
     Example:
-        from unshackle.core.session import session
+        from unshackle.core.session import CurlSession, session
 
         class MyService(Service):
             @staticmethod
-            def get_session():
-                return session()  # Uses config default browser
+            def get_session() -> CurlSession:
+                return session(
+                    browser="chrome",
+                    ja3="...",
+                    akamai="...",
+                    max_retries=5,
+                    status_forcelist=[429, 500],
+                    allowed_methods={"GET", "HEAD", "OPTIONS"},
+                )
     """
-    if browser is None:
-        browser = config.curl_impersonate.get("browser", "chrome124")
 
     session_config = {
-        "impersonate": browser,
-        "timeout": 30.0,
-        "allow_redirects": True,
-        "max_redirects": 15,
-        "verify": True,
+        "impersonate": browser or config.curl_impersonate.get("browser", "chrome"),
+        **kwargs,
     }
 
-    session_config.update(kwargs)
-    session_obj = Session(**session_config)
+    session_obj = CurlSession(**session_config)
     session_obj.headers.update(config.headers)
-
     return session_obj