diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ee4821..dfbcd05 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,107 @@ # Bright Data Python SDK Changelog +## Version 2.1.0 - API Simplification & Naming Convention Fix + +### ๐Ÿšจ Breaking Changes + +#### Removed GenericScraper +```python +# OLD (v2.0.0) +result = await client.scrape.generic.url("https://example.com") + +# NEW (v2.1.0) - Use scrape_url() directly +result = await client.scrape_url("https://example.com") +``` + +#### Async Method Naming Convention +The `_async` suffix has been removed. Now `method()` is async by default, and `method_sync()` is the synchronous version. + +```python +# OLD (v2.0.0) +result = await scraper.products_async(url) +await job.wait_async() +data = await job.fetch_async() + +# NEW (v2.1.0) +result = await scraper.products(url) +await job.wait() +data = await job.fetch() +``` + +#### CLI Command Change +```bash +# OLD +brightdata scrape generic --url https://example.com + +# NEW +brightdata scrape url --url https://example.com +``` + +### โœจ New Features + +#### Complete SyncBrightDataClient +Added comprehensive `sync_client.py` with full coverage for all scrapers: + +```python +from brightdata import SyncBrightDataClient + +with SyncBrightDataClient(token="...") as client: + # All methods work synchronously + result = client.scrape.amazon.products(url) + result = client.scrape.linkedin.profiles(url) + result = client.search.google("query") +``` + +**Supported sync wrappers:** +- `SyncAmazonScraper` - products, reviews, sellers (+ trigger/status/fetch) +- `SyncLinkedInScraper` - profiles, jobs, companies, posts +- `SyncInstagramScraper` - profiles, posts, comments, reels +- `SyncFacebookScraper` - posts_by_profile, posts_by_group, comments, reels +- `SyncChatGPTScraper` - prompt, prompts +- `SyncSearchService` - google, bing, yandex +- `SyncCrawlerService` - crawl, scrape + +#### Context Manager Enforcement +Client methods now require proper context manager initialization: + +```python +# Correct usage +async with BrightDataClient() as client: + result = await client.scrape_url(url) + +# Will raise RuntimeError +client = BrightDataClient() +result = await client.scrape_url(url) # Error: not initialized +``` + +### ๐Ÿ”„ Migration Guide + +#### Method Renames +| Old (v2.0.0) | New (v2.1.0) | +|--------------|--------------| +| `products_async()` | `products()` | +| `reviews_async()` | `reviews()` | +| `profiles_async()` | `profiles()` | +| `jobs_async()` | `jobs()` | +| `wait_async()` | `wait()` | +| `fetch_async()` | `fetch()` | +| `to_result_async()` | `to_result()` | +| `status_async()` | `status()` | +| `scrape.generic.url()` | `scrape_url()` | + +#### Quick Migration +```bash +# Find and replace in your codebase: +_async() โ†’ () +scrape.generic.url โ†’ scrape_url +``` + +### ๐Ÿ“ Documentation +- Simplified README with clearer examples +- Updated all examples and tests to use new naming convention + +--- + ## Version 2.0.0 - Complete Architecture Rewrite ### ๐Ÿšจ Breaking Changes @@ -50,14 +152,14 @@ with ThreadPoolExecutor(max_workers=10) as executor: **New**: Native async/await throughout with sync wrappers ```python -# New approach - native async -async def scrape_async(self, url): +# New approach - native async (method() is async by default) +async def products(self, url): async with self.engine: return await self._execute_workflow(...) 
-# Sync wrapper for compatibility -def scrape(self, url): - return asyncio.run(self.scrape_async(url)) +# Sync client uses persistent event loop +with SyncBrightDataClient() as client: + result = client.scrape.amazon.products(url) ``` #### 2. Service-Based Architecture @@ -102,11 +204,11 @@ data = await fetch_results(snapshot_id) # Get results #### 2. Manual Job Control ```python # New capability - fine-grained control over scraping jobs -job = await scraper.trigger(url) +job = await scraper.products_trigger(url) # Do other work... -status = await job.status_async() +status = await job.status() if status == "ready": - data = await job.fetch_async() + data = await job.fetch() ``` #### 3. Type-Safe Payloads (Dataclasses) @@ -270,11 +372,11 @@ result = client.scrape(url) # New (async-first) async def main(): async with BrightDataClient(token="...") as client: - result = await client.scrape_url_async(url) + result = await client.scrape_url(url) -# Or keep using sync -client = BrightDataClient(token="...") -result = client.scrape_url(url) +# Or use sync client +with SyncBrightDataClient(token="...") as client: + result = client.scrape_url(url) ``` diff --git a/README.md b/README.md index 9d08843..0d7b826 100644 --- a/README.md +++ b/README.md @@ -1,1471 +1,161 @@ -# Bright Data Python SDK ๐Ÿ +# Bright Data Python SDK + +The official Python SDK for [Bright Data](https://brightdata.com) APIs. Scrape any website, get SERP results, bypass bot detection and CAPTCHAs. -[![Tests](https://img.shields.io/badge/tests-502%2B%20passing-brightgreen)](https://github.com/brightdata/sdk-python) [![Python](https://img.shields.io/badge/python-3.9%2B-blue)](https://www.python.org/) [![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE) -[![Code Quality](https://img.shields.io/badge/quality-enterprise--grade-gold)](https://github.com/brightdata/sdk-python) -[![Notebooks](https://img.shields.io/badge/jupyter-5%20notebooks-orange)](notebooks/) - -Modern async-first Python SDK for [Bright Data](https://brightdata.com) APIs with **dataclass payloads**, **Jupyter notebooks**, comprehensive platform support, and **CLI tool** - built for data scientists and developers. 
- ---- - -## ๐Ÿ“‘ Table of Contents - -- [โœจ Features](#-features) -- [๐Ÿ““ Jupyter Notebooks](#-jupyter-notebooks-new) -- [๐Ÿ“ฆ Installation](#-installation) -- [๐Ÿš€ Quick Start](#-quick-start) - - [Authentication](#authentication) - - [Simple Web Scraping](#simple-web-scraping) - - [Using Dataclass Payloads](#using-dataclass-payloads-type-safe-) - - [Pandas Integration](#pandas-integration-for-data-scientists-) - - [Platform-Specific Scraping](#platform-specific-scraping) - - [Search Engine Results (SERP)](#search-engine-results-serp) - - [Async Usage](#async-usage) -- [๐Ÿ†• What's New in v2.0.0](#-whats-new-in-v2-200) -- [๐Ÿ—๏ธ Architecture](#๏ธ-architecture) -- [๐Ÿ“š API Reference](#-api-reference) - - [Client Initialization](#client-initialization) - - [Connection Testing](#connection-testing) - - [Zone Management](#zone-management) - - [Result Objects](#result-objects) -- [๐Ÿ–ฅ๏ธ CLI Usage](#๏ธ-cli-usage) -- [๐Ÿผ Pandas Integration](#-pandas-integration) -- [๐ŸŽจ Dataclass Payloads](#-dataclass-payloads) -- [๐Ÿ”ง Advanced Usage](#-advanced-usage) -- [๐Ÿงช Testing](#-testing) -- [๐Ÿ›๏ธ Design Philosophy](#๏ธ-design-philosophy) -- [๐Ÿ“– Documentation](#-documentation) -- [๐Ÿ”ง Troubleshooting](#-troubleshooting) -- [๐Ÿค Contributing](#-contributing) -- [๐Ÿ“Š Project Stats](#-project-stats) -- [๐Ÿ“ License](#-license) -- [๐Ÿ”— Links](#-links) -- [๐Ÿ’ก Examples](#-examples) -- [๐ŸŽฏ Roadmap](#-roadmap) -- [๐Ÿ™ Acknowledgments](#-acknowledgments) -- [๐ŸŒŸ Why Choose This SDK?](#-why-choose-this-sdk) - ---- - -## โœจ Features - -### ๐ŸŽฏ **For Data Scientists** -- ๐Ÿ““ **5 Jupyter Notebooks** - Complete tutorials from quickstart to batch processing -- ๐Ÿผ **Pandas Integration** - Native DataFrame support with examples -- ๐Ÿ“Š **Data Analysis Ready** - Built-in visualization, export to CSV/Excel -- ๐Ÿ’ฐ **Cost Tracking** - Budget management and cost analytics -- ๐Ÿ”„ **Progress Bars** - tqdm integration for batch operations -- ๐Ÿ’พ **Caching Support** - joblib integration for development - -### ๐Ÿ—๏ธ **Core Features** -- ๐Ÿš€ **Async-first architecture** with sync wrappers for compatibility -- ๐ŸŽจ **Dataclass Payloads** - Runtime validation, IDE autocomplete, helper methods -- ๐ŸŒ **Web scraping** via Web Unlocker proxy service -- ๐Ÿ” **SERP API** - Google, Bing, Yandex search results -- ๐Ÿ“ฆ **Platform scrapers** - LinkedIn, Amazon, ChatGPT, Facebook, Instagram -- ๐ŸŽฏ **Dual namespace** - `scrape` (URL-based) + `search` (discovery) -- ๐Ÿ–ฅ๏ธ **CLI Tool** - `brightdata` command for terminal usage - -### ๐Ÿ›ก๏ธ **Enterprise Grade** -- ๐Ÿ”’ **100% type safety** - Dataclasses + TypedDict definitions -- โœ… **502+ comprehensive tests** - Unit, integration, and E2E -- โšก **Resource efficient** - Single shared AsyncEngine -- ๐ŸŽจ **Rich result objects** - Timing, cost tracking, method tracking -- ๐Ÿ” **.env file support** - Automatic loading via python-dotenv -- ๐Ÿ›ก๏ธ **SSL error handling** - Helpful guidance for certificate issues -- ๐Ÿ“Š **Function-level monitoring** - Track which SDK methods are used - ---- - -## ๐Ÿ““ Jupyter Notebooks (NEW!) - -Perfect for data scientists! Interactive tutorials with examples: -1. **[01_quickstart.ipynb](notebooks/01_quickstart.ipynb)** - Get started in 5 minutes [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/brightdata/sdk-python/blob/main/notebooks/01_quickstart.ipynb) -2. 
**[02_pandas_integration.ipynb](notebooks/02_pandas_integration.ipynb)** - Work with DataFrames [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/brightdata/sdk-python/blob/main/notebooks/02_pandas_integration.ipynb) -3. **[03_amazon_scraping.ipynb](notebooks/03_amazon_scraping.ipynb)** - Amazon deep dive [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/brightdata/sdk-python/blob/main/notebooks/03_amazon_scraping.ipynb) -4. **[04_linkedin_jobs.ipynb](notebooks/04_linkedin_jobs.ipynb)** - Job market analysis [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/brightdata/sdk-python/blob/main/notebooks/04_linkedin_jobs.ipynb) -5. **[05_batch_processing.ipynb](notebooks/05_batch_processing.ipynb)** - Scale to 1000s of URLs [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/brightdata/sdk-python/blob/main/notebooks/05_batch_processing.ipynb) - ---- - -## ๐Ÿ“ฆ Installation +## Installation ```bash pip install brightdata-sdk ``` -Or install from source: - -```bash -git clone https://github.com/brightdata/sdk-python.git -cd sdk-python -pip install -e . -``` - ---- +## Configuration -## ๐Ÿš€ Quick Start - -### Authentication - -Set your API token as an environment variable: +Get your API Token from the [Bright Data Control Panel](https://brightdata.com/cp/api_keys): ```bash export BRIGHTDATA_API_TOKEN="your_api_token_here" -export BRIGHTDATA_CUSTOMER_ID="your_customer_id" # Optional -``` - -Or use a `.env` file (automatically loaded): - -```bash -# .env -BRIGHTDATA_API_TOKEN=your_api_token_here -BRIGHTDATA_CUSTOMER_ID=your_customer_id # Optional -``` - -Or pass credentials directly: - -```python -from brightdata import BrightDataClient - -client = BrightDataClient( - token="your_api_token", - customer_id="your_customer_id" # Optional -) -``` - -### Simple Web Scraping - -```python -from brightdata import BrightDataClient - -# Initialize client (auto-loads token from environment) -client = BrightDataClient() - -# Scrape any website (sync wrapper) -result = client.scrape.generic.url("https://example.com") - -if result.success: -print(f"Success: {result.success}") -print(f"Data: {result.data[:200]}...") -print(f"Time: {result.elapsed_ms():.2f}ms") -else: - print(f"Error: {result.error}") -``` - -### Using Dataclass Payloads (Type-Safe โœจ) - -```python -from brightdata import BrightDataClient -from brightdata.payloads import AmazonProductPayload, LinkedInJobSearchPayload - -client = BrightDataClient() - -# Amazon with validated payload -payload = AmazonProductPayload( - url="https://amazon.com/dp/B123456789", - reviews_count=50 # Runtime validated! 
-) -print(f"ASIN: {payload.asin}") # Helper property - -result = client.scrape.amazon.products(**payload.to_dict()) - -# LinkedIn job search with validation -job_payload = LinkedInJobSearchPayload( - keyword="python developer", - location="New York", - remote=True -) -print(f"Remote search: {job_payload.is_remote_search}") - -jobs = client.search.linkedin.jobs(**job_payload.to_dict()) -``` - -### Pandas Integration for Data Scientists ๐Ÿผ - -```python -import pandas as pd -from brightdata import BrightDataClient - -client = BrightDataClient() - -# Scrape multiple products -urls = ["https://amazon.com/dp/B001", "https://amazon.com/dp/B002"] -results = [] - -for url in urls: - result = client.scrape.amazon.products(url=url) - if result.success: - results.append({ - 'title': result.data.get('title'), - 'price': result.data.get('final_price'), - 'rating': result.data.get('rating'), - 'cost': result.cost - }) - -# Convert to DataFrame -df = pd.DataFrame(results) -print(df.describe()) - -# Export to CSV -df.to_csv('products.csv', index=False) -``` - -### Platform-Specific Scraping - -#### Amazon Products - -```python -# Scrape specific product URLs -result = client.scrape.amazon.products( - url="https://amazon.com/dp/B0CRMZHDG8", - timeout=65 -) - -# Extract reviews with filters -result = client.scrape.amazon.reviews( - url="https://amazon.com/dp/B0CRMZHDG8", - pastDays=30, - keyWord="quality", - numOfReviews=100 -) - -# Scrape seller information -result = client.scrape.amazon.sellers( - url="https://amazon.com/sp?seller=AXXXXXXXXX" -) - -# NEW: Search Amazon by keyword and filters -result = client.search.amazon.products( - keyword="laptop", - min_price=50000, # $500 in cents - max_price=200000, # $2000 in cents - prime_eligible=True, - condition="new" -) - -# Search by category -result = client.search.amazon.products( - keyword="wireless headphones", - category="electronics" -) -``` - -#### LinkedIn Data - -```python -# URL-based extraction -result = client.scrape.linkedin.profiles( - url="https://linkedin.com/in/johndoe" -) - -result = client.scrape.linkedin.jobs( - url="https://linkedin.com/jobs/view/123456" -) - -result = client.scrape.linkedin.companies( - url="https://linkedin.com/company/microsoft" -) - -result = client.scrape.linkedin.posts( - url="https://linkedin.com/feed/update/..." 
-) - -# Discovery/search operations -result = client.search.linkedin.jobs( - keyword="python developer", - location="New York", - remote=True, - experienceLevel="mid" -) - -result = client.search.linkedin.profiles( - firstName="John", - lastName="Doe" -) - -result = client.search.linkedin.posts( - profile_url="https://linkedin.com/in/johndoe", - start_date="2025-01-01", - end_date="2025-12-31" -) -``` - -#### ChatGPT Interactions - -```python -# Send single prompt to ChatGPT -result = client.scrape.chatgpt.prompt( - prompt="Explain Python async programming", - country="us", - web_search=True -) - -# Batch prompts -result = client.scrape.chatgpt.prompts( - prompts=["What is Python?", "What is JavaScript?", "Compare them"], - web_searches=[False, False, True] -) -``` - -#### Facebook Data - -```python -# Scrape posts from profile -result = client.scrape.facebook.posts_by_profile( - url="https://facebook.com/profile", - num_of_posts=10, - start_date="01-01-2025", - end_date="12-31-2025", - timeout=240 -) - -# Scrape posts from group -result = client.scrape.facebook.posts_by_group( - url="https://facebook.com/groups/example", - num_of_posts=20, - timeout=240 -) - -# Scrape specific post -result = client.scrape.facebook.posts_by_url( - url="https://facebook.com/post/123456", - timeout=240 -) - -# Scrape comments from post -result = client.scrape.facebook.comments( - url="https://facebook.com/post/123456", - num_of_comments=100, - start_date="01-01-2025", - end_date="12-31-2025", - timeout=240 -) - -# Scrape reels from profile -result = client.scrape.facebook.reels( - url="https://facebook.com/profile", - num_of_posts=50, - timeout=240 -) -``` - -#### Instagram Data - -```python -# Scrape Instagram profile -result = client.scrape.instagram.profiles( - url="https://instagram.com/username", - timeout=240 -) - -# Scrape specific post -result = client.scrape.instagram.posts( - url="https://instagram.com/p/ABC123", - timeout=240 -) - -# Scrape comments from post -result = client.scrape.instagram.comments( - url="https://instagram.com/p/ABC123", - timeout=240 -) - -# Scrape specific reel -result = client.scrape.instagram.reels( - url="https://instagram.com/reel/ABC123", - timeout=240 -) - -# Discover posts from profile (with filters) -result = client.search.instagram.posts( - url="https://instagram.com/username", - num_of_posts=10, - start_date="01-01-2025", - end_date="12-31-2025", - post_type="reel", - timeout=240 -) - -# Discover reels from profile -result = client.search.instagram.reels( - url="https://instagram.com/username", - num_of_posts=50, - start_date="01-01-2025", - end_date="12-31-2025", - timeout=240 -) ``` -### Search Engine Results (SERP) - -```python -# Google search -result = client.search.google( - query="python tutorial", - location="United States", - language="en", - num_results=20 -) +## Quick Start -# Access results -for item in result.data: - print(f"{item['position']}. {item['title']}") - print(f" {item['url']}") - -# Bing search -result = client.search.bing( - query="python tutorial", - location="United States" -) - -# Yandex search -result = client.search.yandex( - query="python tutorial", - location="Russia" -) -``` - -### Async Usage - -For better performance with multiple operations, use async: +This SDK is **async-native**. A sync client is also available (see [Sync Client](#sync-client)). 
```python import asyncio from brightdata import BrightDataClient -async def scrape_multiple(): - # Use async context manager for engine lifecycle +async def main(): async with BrightDataClient() as client: - # Scrape multiple URLs concurrently - results = await client.scrape.generic.url_async([ - "https://example1.com", - "https://example2.com", - "https://example3.com" - ]) - - for result in results: - print(f"Success: {result.success}") + result = await client.scrape_url("https://example.com") + print(result.data) -asyncio.run(scrape_multiple()) +asyncio.run(main()) ``` -**Important:** When using `*_async` methods, always use the async context manager (`async with BrightDataClient() as client`). Sync wrappers (methods without `_async`) handle this automatically. - ---- - -## ๐Ÿ†• What's New in v2 2.0.0 - -### ๐Ÿ†• **Latest Updates (December 2025)** -- โœ… **Amazon Search API** - NEW parameter-based product discovery with correct dataset -- โœ… **LinkedIn Job Search Fixed** - Now builds URLs from keywords internally -- โœ… **Trigger Interface** - Manual trigger/poll/fetch control for all platforms -- โœ… **29 Sync Wrapper Fixes** - All sync methods work (scrapers + SERP API) -- โœ… **Batch Operations Fixed** - Returns List[ScrapeResult] correctly -- โœ… **Auto-Create Zones** - Now enabled by default (was opt-in) -- โœ… **Improved Zone Names** - `sdk_unlocker`, `sdk_serp`, `sdk_browser` -- โœ… **Full Sync/Async Examples** - README now shows both patterns for all features - -### ๐ŸŽ“ **For Data Scientists** -- โœ… **5 Jupyter Notebooks** - Complete interactive tutorials -- โœ… **Pandas Integration** - Native DataFrame support with examples -- โœ… **Batch Processing Guide** - Scale to 1000s of URLs with progress bars -- โœ… **Cost Management** - Budget tracking and optimization -- โœ… **Visualization Examples** - matplotlib/seaborn integration - -### ๐ŸŽจ **Dataclass Payloads (Major Upgrade)** -- โœ… **Runtime Validation** - Catch errors at instantiation time -- โœ… **Helper Properties** - `.asin`, `.is_remote_search`, `.domain`, etc. 
-- โœ… **IDE Autocomplete** - Full IntelliSense support -- โœ… **Default Values** - Smart defaults (e.g., `country="US"`) -- โœ… **to_dict() Method** - Easy API conversion -- โœ… **Consistent Model** - Same pattern as result models - -### ๐Ÿ–ฅ๏ธ **CLI Tool** -- โœ… **`brightdata` command** - Use SDK from terminal -- โœ… **Scrape operations** - `brightdata scrape amazon products ...` -- โœ… **Search operations** - `brightdata search amazon products --keyword ...` -- โœ… **Output formats** - JSON, pretty-print, minimal - -### ๐Ÿ—๏ธ **Architecture Improvements** -- โœ… **Single AsyncEngine** - Shared across all scrapers (8x efficiency) -- โœ… **Resource Optimization** - Reduced memory footprint -- โœ… **Enhanced Error Messages** - Clear, actionable error messages -- โœ… **500+ Tests Passing** - Comprehensive test coverage (99.4%) - -### ๐Ÿ†• **Platforms & Features** -- โœ… **Amazon Search** - Keyword-based product discovery -- โœ… **Facebook Scraper** - Posts (profile/group/URL), Comments, Reels -- โœ… **Instagram Scraper** - Profiles, Posts, Comments, Reels -- โœ… **Instagram Search** - Posts and Reels discovery with filters - ---- - -## ๐Ÿ—๏ธ Architecture - -### Hierarchical Service Access - -The SDK provides a clean, intuitive interface organized by operation type: - -```python -client = BrightDataClient() - -# URL-based extraction (scrape namespace) -client.scrape.amazon.products(url="...") -client.scrape.linkedin.profiles(url="...") -client.scrape.facebook.posts_by_profile(url="...") -client.scrape.instagram.profiles(url="...") -client.scrape.generic.url(url="...") - -# Parameter-based discovery (search namespace) -client.search.amazon.products(keyword="...", min_price=..., max_price=...) -client.search.linkedin.jobs(keyword="...", location="...") -client.search.instagram.posts(url="...", num_of_posts=10) -client.search.google(query="...") -client.scrape.chatgpt.prompt(prompt="...") - -# Direct service access (advanced) -client.web_unlocker.fetch(url="...") -client.crawler.discover(url="...") # Coming soon -``` - -### Core Components - -- **`BrightDataClient`** - Main entry point with authentication and .env support -- **`ScrapeService`** - URL-based data extraction -- **`SearchService`** - Parameter-based discovery -- **Result Models** - `ScrapeResult`, `SearchResult`, `CrawlResult` with method tracking -- **Platform Scrapers** - Amazon, LinkedIn, ChatGPT, Facebook, Instagram with registry pattern -- **SERP Services** - Google, Bing, Yandex search -- **Type System** - 100% type safety with TypedDict -- **Constants Module** - Centralized configuration (no magic numbers) -- **SSL Helpers** - Platform-specific error guidance -- **Function Detection** - Automatic SDK function tracking for monitoring - ---- - -## ๐Ÿ“š API Reference - -### Client Initialization - -```python -client = BrightDataClient( - token="your_token", # Auto-loads from BRIGHTDATA_API_TOKEN if not provided - customer_id="your_customer_id", # Auto-loads from BRIGHTDATA_CUSTOMER_ID (optional) - timeout=30, # Default timeout in seconds - web_unlocker_zone="sdk_unlocker", # Web Unlocker zone name (default) - serp_zone="sdk_serp", # SERP API zone name (default) - browser_zone="sdk_browser", # Browser API zone name (default) - auto_create_zones=True, # Auto-create missing zones (default: True) - validate_token=False # Validate token on init (default: False) -) -``` - -**Environment Variables:** -- `BRIGHTDATA_API_TOKEN` - Your API token (required) -- `BRIGHTDATA_CUSTOMER_ID` - Your customer ID (optional) - -Both are 
automatically loaded from environment or `.env` file. - -### Connection Testing - -```python -# Test API connection -is_valid = await client.test_connection() -is_valid = client.test_connection_sync() # Synchronous version - -# Get account information -info = await client.get_account_info() -info = client.get_account_info_sync() - -print(f"Zones: {info['zone_count']}") -print(f"Active zones: {[z['name'] for z in info['zones']]}") -``` - -### Zone Management - -The SDK can automatically create required zones if they don't exist, or you can manage zones manually. +## Usage Examples -#### Automatic Zone Creation - -Enable automatic zone creation when initializing the client: +### Web Scraping ```python -client = BrightDataClient( - token="your_token", - auto_create_zones=True # Automatically create zones if missing -) - -# Zones are created on first API call -async with client: - # sdk_unlocker, sdk_serp, and sdk_browser zones created automatically if needed - result = await client.scrape.amazon.products(url="...") +async with BrightDataClient() as client: + result = await client.scrape_url("https://example.com") + print(result.data) ``` -#### Manual Zone Management - -List and manage zones programmatically: +### Search Engines (SERP) ```python -# List all zones -zones = await client.list_zones() -zones = client.list_zones_sync() # Synchronous version - -for zone in zones: - print(f"Zone: {zone['name']} (Type: {zone.get('type', 'unknown')})") - -# Advanced: Use ZoneManager directly -from brightdata import ZoneManager - -async with client.engine: - zone_manager = ZoneManager(client.engine) - - # Ensure specific zones exist - await zone_manager.ensure_required_zones( - web_unlocker_zone="my_custom_zone", - serp_zone="my_serp_zone" - ) +async with BrightDataClient() as client: + result = await client.search.google(query="python scraping", num_results=10) + for item in result.data: + print(item) ``` -**Zone Creation API:** -- Endpoint: `POST https://api.brightdata.com/zone` -- Zones are created via the Bright Data API -- Supported zone types: `unblocker`, `serp`, `browser` -- Automatically handles duplicate zones gracefully +### Web Scraper API -### Result Objects +The SDK includes ready-to-use scrapers for popular websites: Amazon, LinkedIn, Instagram, Facebook, and more. 
-All operations return rich result objects with timing and metadata:
+**Pattern:** `client.scrape.<platform>.<method>(url)`
+**Example: Amazon**
```python
-result = client.scrape.amazon.products(url="...")
+async with BrightDataClient() as client:
+    # Product details
+    result = await client.scrape.amazon.products(url="https://amazon.com/dp/B0CRMZHDG8")
-# Access data
-result.success  # bool - Operation succeeded
-result.data  # Any - Scraped data
-result.error  # str | None - Error message if failed
-result.cost  # float | None - Cost in USD
-result.platform  # str | None - Platform name (e.g., "linkedin", "amazon")
-result.method  # str | None - Method used: "web_scraper", "web_unlocker", "browser_api"
+    # Reviews
+    result = await client.scrape.amazon.reviews(url="https://amazon.com/dp/B0CRMZHDG8")
-# Timing information
-result.elapsed_ms()  # Total time in milliseconds
-result.get_timing_breakdown()  # Detailed timing dict
+    # Sellers
+    result = await client.scrape.amazon.sellers(url="https://amazon.com/dp/B0CRMZHDG8")
-# Serialization
-result.to_dict()  # Convert to dictionary
-result.to_json(indent=2)  # JSON string
-result.save_to_file("result.json")  # Save to file
```
---
-## ๐Ÿ–ฅ๏ธ CLI Usage
-
-The SDK includes a powerful CLI tool:
-
-```bash
-# Help
-brightdata --help
-
-# Scrape Amazon product (URL is positional argument)
-brightdata scrape amazon products \
-    "https://amazon.com/dp/B0CRMZHDG8"
+**Available scrapers:**
+- `client.scrape.amazon` - products, reviews, sellers
+- `client.scrape.linkedin` - profiles, companies, jobs, posts
+- `client.scrape.instagram` - profiles, posts, comments, reels
+- `client.scrape.facebook` - posts, comments, reels
-# Search LinkedIn jobs
-brightdata search linkedin jobs \
-    --keyword "python developer" \
-    --location "New York" \
-    --remote \
-    --output-file jobs.json
-
-# Search Google (query is positional argument)
-brightdata search google \
-    "python tutorial" \
-    --location "United States"
-
-# Generic web scraping (URL is positional argument)
-brightdata scrape generic \
-    "https://example.com" \
-    --response-format raw \
-    --output-format pretty
-```
-
-### Available Commands
-
-**Scrape Operations:**
-- `brightdata scrape amazon products/reviews/sellers`
-- `brightdata scrape linkedin profiles/jobs/companies/posts`
-- `brightdata scrape facebook posts-profile/posts-group/comments/reels`
-- `brightdata scrape instagram profiles/posts/comments/reels`
-- `brightdata scrape chatgpt prompt`
-- `brightdata scrape generic url`
-
-**Search Operations:**
-- `brightdata search amazon products`
-- `brightdata search linkedin jobs/profiles/posts`
-- `brightdata search instagram posts/reels`
-- `brightdata search google/bing/yandex`
-- `brightdata search chatgpt`
-
-### CLI Output Formats
-
-The CLI supports two different format parameters for different purposes:
-
-#### Global Output Format (`--output-format`)
-
-Controls **how results are displayed** (available for ALL commands):
-
-```bash
-# JSON format (default) - Full structured output
-brightdata scrape amazon products "https://amazon.com/dp/B123" --output-format json
+## Async Usage
-# Pretty format - Human-readable with formatted output
-brightdata scrape amazon products "https://amazon.com/dp/B123" --output-format pretty
-
-# Minimal format - Just the data, no metadata
-brightdata scrape amazon products "https://amazon.com/dp/B123" --output-format minimal
-```
-
-#### Generic Scraper Response Format (`--response-format`)
-
-Controls **what the API returns** (generic scraper only):
-
-```bash
-# Raw format 
(default) - Returns HTML/text as-is -brightdata scrape generic "https://example.com" --response-format raw - -# JSON format - API attempts to parse as JSON -brightdata scrape generic "https://api.example.com/data" --response-format json -``` - -**Note:** You can combine both: -```bash -brightdata scrape generic "https://example.com" \ - --response-format raw \ - --output-format pretty -``` - ---- - -## ๐Ÿผ Pandas Integration - -Perfect for data analysis workflows: +Run multiple requests concurrently: ```python -import pandas as pd -from tqdm import tqdm -from brightdata import BrightDataClient -from brightdata.payloads import AmazonProductPayload - -client = BrightDataClient() - -# Batch scrape with progress bar -urls = ["https://amazon.com/dp/B001", "https://amazon.com/dp/B002"] -results = [] - -for url in tqdm(urls, desc="Scraping"): - payload = AmazonProductPayload(url=url) - result = client.scrape.amazon.products(**payload.to_dict()) - - if result.success: - results.append({ - 'asin': payload.asin, - 'title': result.data.get('title'), - 'price': result.data.get('final_price'), - 'rating': result.data.get('rating'), - 'cost': result.cost, - 'elapsed_ms': result.elapsed_ms() - }) - -# Create DataFrame -df = pd.DataFrame(results) - -# Analysis -print(df.describe()) -print(f"Total cost: ${df['cost'].sum():.4f}") -print(f"Avg rating: {df['rating'].mean():.2f}") - -# Export -df.to_csv('amazon_products.csv', index=False) -df.to_excel('amazon_products.xlsx', index=False) - -# Visualization -import matplotlib.pyplot as plt -df.plot(x='asin', y='rating', kind='bar', title='Product Ratings') -plt.show() -``` - -See **[notebooks/02_pandas_integration.ipynb](notebooks/02_pandas_integration.ipynb)** for complete examples. - ---- - -## ๐ŸŽจ Dataclass Payloads - -All payloads are now dataclasses with runtime validation: - -### Amazon Payloads - -```python -from brightdata.payloads import AmazonProductPayload, AmazonReviewPayload - -# Product with validation -payload = AmazonProductPayload( - url="https://amazon.com/dp/B123456789", - reviews_count=50, - images_count=10 -) - -# Helper properties -print(payload.asin) # "B123456789" -print(payload.domain) # "amazon.com" -print(payload.is_secure) # True - -# Convert to API dict -api_dict = payload.to_dict() # Excludes None values -``` - -### LinkedIn Payloads - -```python -from brightdata.payloads import LinkedInJobSearchPayload - -payload = LinkedInJobSearchPayload( - keyword="python developer", - location="San Francisco", - remote=True, - experienceLevel="mid" -) - -# Helper properties -print(payload.is_remote_search) # True - -# Use with client -result = client.search.linkedin.jobs(**payload.to_dict()) -``` - -### ChatGPT Payloads - -```python -from brightdata.payloads import ChatGPTPromptPayload - -payload = ChatGPTPromptPayload( - prompt="Explain async programming", - web_search=True -) - -# Default values -print(payload.country) # "US" (default) -print(payload.uses_web_search) # True -``` - -### Validation Examples - -```python -# Runtime validation catches errors early -try: - AmazonProductPayload(url="invalid-url") -except ValueError as e: - print(e) # "url must be valid HTTP/HTTPS URL" - -try: - AmazonProductPayload( - url="https://amazon.com/dp/B123", - reviews_count=-1 - ) -except ValueError as e: - print(e) # "reviews_count must be non-negative" -``` - ---- - -## ๐Ÿ”ง Advanced Usage - -### Batch Operations - -```python -# Scrape multiple URLs concurrently -urls = [ - "https://amazon.com/dp/B001", - "https://amazon.com/dp/B002", - 
"https://amazon.com/dp/B003" -] - -results = client.scrape.amazon.products(url=urls) - -for result in results: - if result.success: - print(f"{result.data['title']}: ${result.data['price']}") -``` - -### Platform-Specific Options - -```python -# Amazon reviews with filters -result = client.scrape.amazon.reviews( - url="https://amazon.com/dp/B123", - pastDays=7, # Last 7 days only - keyWord="quality", # Filter by keyword - numOfReviews=50 # Limit to 50 reviews -) - -# LinkedIn jobs with extensive filters -result = client.search.linkedin.jobs( - keyword="python developer", - location="New York", - country="us", - jobType="full-time", - experienceLevel="mid", - remote=True, - company="Microsoft", - timeRange="past-week" -) -``` - -### Sync vs Async Examples - Full Coverage - -All SDK methods support **both sync and async** patterns. Choose based on your needs: - -#### **Amazon Products** - -```python -# SYNC - Simple scripts -result = client.scrape.amazon.products(url="https://amazon.com/dp/B123") - -# ASYNC - Concurrent operations import asyncio +from brightdata import BrightDataClient -async def scrape_amazon(): - async with BrightDataClient() as client: - result = await client.scrape.amazon.products_async(url="https://amazon.com/dp/B123") - return result - -result = asyncio.run(scrape_amazon()) -``` - -#### **Amazon Search** - -```python -# SYNC - Simple keyword search -result = client.search.amazon.products(keyword="laptop", prime_eligible=True) - -# ASYNC - Batch keyword searches -async def search_amazon(): +async def main(): async with BrightDataClient() as client: - result = await client.search.amazon.products_async( - keyword="laptop", - min_price=50000, - max_price=200000, - prime_eligible=True - ) - return result + urls = ["https://example.com/page1", "https://example.com/page2", "https://example.com/page3"] + tasks = [client.scrape_url(url) for url in urls] + results = await asyncio.gather(*tasks) -result = asyncio.run(search_amazon()) +asyncio.run(main()) ``` -#### **LinkedIn Scraping** - -```python -# SYNC - Single profile -result = client.scrape.linkedin.profiles(url="https://linkedin.com/in/johndoe") - -# ASYNC - Multiple profiles concurrently -async def scrape_linkedin(): - async with BrightDataClient() as client: - urls = ["https://linkedin.com/in/person1", "https://linkedin.com/in/person2"] - results = await client.scrape.linkedin.profiles_async(url=urls) - return results - -results = asyncio.run(scrape_linkedin()) -``` +### Manual Trigger/Poll/Fetch -#### **LinkedIn Job Search** +For long-running scrapes: ```python -# SYNC - Simple job search -result = client.search.linkedin.jobs(keyword="python", location="NYC", remote=True) +async with BrightDataClient() as client: + # Trigger + job = await client.scrape.amazon.products_trigger(url="https://amazon.com/dp/B123") -# ASYNC - Advanced search with filters -async def search_jobs(): - async with BrightDataClient() as client: - result = await client.search.linkedin.jobs_async( - keyword="python developer", - location="New York", - experienceLevel="mid", - jobType="full-time", - remote=True - ) - return result + # Wait for completion + await job.wait(timeout=180) -result = asyncio.run(search_jobs()) + # Fetch results + data = await job.fetch() ``` -#### **SERP API (Google, Bing, Yandex)** - -```python -# SYNC - Quick Google search -result = client.search.google(query="python tutorial", location="United States") - -# ASYNC - Multiple search engines concurrently -async def search_all_engines(): - async with BrightDataClient() as 
client: - google = await client.search.google_async(query="python", num_results=10) - bing = await client.search.bing_async(query="python", num_results=10) - yandex = await client.search.yandex_async(query="python", num_results=10) - return google, bing, yandex - -results = asyncio.run(search_all_engines()) -``` +## Sync Client -#### **Facebook Scraping** +For simpler use cases, use `SyncBrightDataClient`: ```python -# SYNC - Single profile posts -result = client.scrape.facebook.posts_by_profile( - url="https://facebook.com/profile", - num_of_posts=10 -) +from brightdata import SyncBrightDataClient -# ASYNC - Multiple sources -async def scrape_facebook(): - async with BrightDataClient() as client: - profile_posts = await client.scrape.facebook.posts_by_profile_async( - url="https://facebook.com/zuck", - num_of_posts=10 - ) - group_posts = await client.scrape.facebook.posts_by_group_async( - url="https://facebook.com/groups/programming", - num_of_posts=10 - ) - return profile_posts, group_posts +with SyncBrightDataClient() as client: + result = client.scrape_url("https://example.com") + print(result.data) -results = asyncio.run(scrape_facebook()) + # All methods work the same + result = client.scrape.amazon.products(url="https://amazon.com/dp/B123") + result = client.search.google(query="python") ``` -#### **Instagram Scraping** - -```python -# SYNC - Single profile -result = client.scrape.instagram.profiles(url="https://instagram.com/instagram") +See [docs/sync_client.md](docs/sync_client.md) for details. -# ASYNC - Profile + posts -async def scrape_instagram(): - async with BrightDataClient() as client: - profile = await client.scrape.instagram.profiles_async( - url="https://instagram.com/instagram" - ) - posts = await client.scrape.instagram.posts_async( - url="https://instagram.com/p/ABC123" - ) - return profile, posts - -results = asyncio.run(scrape_instagram()) -``` - -#### **ChatGPT** - -```python -# SYNC - Single prompt -result = client.scrape.chatgpt.prompt(prompt="Explain Python", web_search=True) - -# ASYNC - Batch prompts -async def ask_chatgpt(): - async with BrightDataClient() as client: - result = await client.scrape.chatgpt.prompts_async( - prompts=["What is Python?", "What is JavaScript?"], - web_searches=[False, True] - ) - return result - -result = asyncio.run(ask_chatgpt()) -``` - -#### **Generic Web Scraping** +## Troubleshooting +**RuntimeError: SyncBrightDataClient cannot be used inside async context** ```python -# SYNC - Single URL -result = client.scrape.generic.url(url="https://example.com") +# Wrong - using sync client in async function +async def main(): + with SyncBrightDataClient() as client: # Error! + ... 
-# ASYNC - Concurrent scraping -async def scrape_multiple(): +# Correct - use async client +async def main(): async with BrightDataClient() as client: - results = await client.scrape.generic.url_async([ - "https://example1.com", - "https://example2.com", - "https://example3.com" - ]) - return results - -results = asyncio.run(scrape_multiple()) + result = await client.scrape_url("https://example.com") ``` ---- - -### **When to Use Sync vs Async** - -**Use Sync When:** -- โœ… Simple scripts or notebooks -- โœ… Single operations at a time -- โœ… Learning or prototyping -- โœ… Sequential workflows - -**Use Async When:** -- โœ… Scraping multiple URLs concurrently -- โœ… Combining multiple API calls -- โœ… Production applications -- โœ… Performance-critical operations - -**Note:** Sync wrappers (e.g., `profiles()`) internally use `asyncio.run()` and cannot be called from within an existing async context. Use `*_async` methods when you're already in an async function. - -### SSL Certificate Error Handling - -The SDK includes comprehensive SSL error handling with platform-specific guidance: - +**RuntimeError: BrightDataClient not initialized** ```python -from brightdata import BrightDataClient -from brightdata.exceptions import SSLError - -try: - client = BrightDataClient() - result = client.scrape.generic.url("https://example.com") -except SSLError as e: - # Helpful error message with platform-specific fix instructions - print(e) - # On macOS, suggests: - # - pip install --upgrade certifi - # - Running Install Certificates.command - # - Setting SSL_CERT_FILE environment variable -``` - -**Common SSL fixes:** - -```bash -# Option 1: Upgrade certifi -pip install --upgrade certifi - -# Option 2: Set SSL_CERT_FILE (macOS/Linux) -export SSL_CERT_FILE=$(python -m certifi) - -# Option 3: Run Install Certificates (macOS python.org installers) -/Applications/Python\ 3.x/Install\ Certificates.command -``` - -### Code Quality Improvements (PR #6) - -Recent architectural refactoring includes: - -#### 1. **Centralized Constants Module** -All magic numbers moved to `constants.py`: -```python -from brightdata.constants import ( - DEFAULT_POLL_INTERVAL, # 10 seconds - DEFAULT_POLL_TIMEOUT, # 600 seconds - DEFAULT_TIMEOUT_SHORT, # 180 seconds - DEFAULT_TIMEOUT_MEDIUM, # 240 seconds - DEFAULT_COST_PER_RECORD, # 0.001 USD -) -``` - -#### 2. **Method Field Instead of Fallback** -Results now track which method was used: -```python -result = client.scrape.amazon.products(url="...") -print(result.method) # "web_scraper", "web_unlocker", or "browser_api" -``` - -#### 3. **Function-Level Monitoring** -Automatic tracking of which SDK functions are called: -```python -# Automatically detected and sent in API requests -result = client.scrape.linkedin.profiles(url="...") -# Internal: sdk_function="profiles" sent to Bright Data -``` - -#### 4. **Service Class Separation** -Clean separation of concerns: -- `ScrapeService` - URL-based extraction -- `SearchService` - Parameter-based discovery -- `CrawlerService` - Web crawling (coming soon) -- `WebUnlockerService` - Direct proxy access - -#### 5. 
**Enhanced SSL Error Handling** -Platform-specific guidance for certificate issues: -```python -from brightdata.utils.ssl_helpers import ( - is_ssl_certificate_error, - get_ssl_error_message -) -``` - ---- - -## ๐Ÿงช Testing - -The SDK includes 365+ comprehensive tests: - -```bash -# Run all tests -pytest tests/ - -# Run specific test suites -pytest tests/unit/ # Unit tests -pytest tests/integration/ # Integration tests -pytest tests/e2e/ # End-to-end tests - -# Run with coverage -pytest tests/ --cov=brightdata --cov-report=html -``` - ---- - -## ๐Ÿ›๏ธ Design Philosophy - -- **Client is single source of truth** for configuration -- **Authentication "just works"** with minimal setup -- **Fail fast and clearly** when credentials are missing/invalid -- **Each platform is an expert** in its domain -- **Scrape vs Search distinction** is clear and consistent -- **Build for future** - registry pattern enables intelligent routing - ---- - -## ๐Ÿ“– Documentation - -### Jupyter Notebooks (Interactive) -- [01_quickstart.ipynb](notebooks/01_quickstart.ipynb) - 5-minute getting started -- [02_pandas_integration.ipynb](notebooks/02_pandas_integration.ipynb) - DataFrame workflows -- [03_amazon_scraping.ipynb](notebooks/03_amazon_scraping.ipynb) - Amazon deep dive -- [04_linkedin_jobs.ipynb](notebooks/04_linkedin_jobs.ipynb) - Job market analysis -- [05_batch_processing.ipynb](notebooks/05_batch_processing.ipynb) - Scale to production - -### Code Examples -- [examples/10_pandas_integration.py](examples/10_pandas_integration.py) - Pandas integration -- [examples/01_simple_scrape.py](examples/01_simple_scrape.py) - Basic usage -- [examples/03_batch_scraping.py](examples/03_batch_scraping.py) - Batch operations -- [examples/04_specialized_scrapers.py](examples/04_specialized_scrapers.py) - Platform-specific -- [All examples โ†’](examples/) - -### Documentation -- [API Reference](docs/api-reference/) -- [Contributing Guidelines](https://github.com/brightdata/sdk-python/blob/main/CONTRIBUTING.md) (See upstream repo) - ---- - -## ๐Ÿ”ง Troubleshooting - -### SSL Certificate Errors (macOS) - -If you encounter SSL certificate verification errors, especially on macOS: - -``` -SSL: CERTIFICATE_VERIFY_FAILED -``` - -The SDK will provide helpful, platform-specific guidance. Quick fixes: - -```bash -# Option 1: Upgrade certifi -pip install --upgrade certifi - -# Option 2: Set SSL_CERT_FILE environment variable -export SSL_CERT_FILE=$(python -m certifi) - -# Option 3: Run Install Certificates (macOS with python.org installer) -/Applications/Python\ 3.x/Install\ Certificates.command - -# Option 4: Install via Homebrew (if using Homebrew Python) -brew install ca-certificates -``` - -### Missing Token - -```python -# Error: BRIGHTDATA_API_TOKEN not found in environment - -# Solution 1: Create .env file -echo "BRIGHTDATA_API_TOKEN=your_token" > .env - -# Solution 2: Export environment variable -export BRIGHTDATA_API_TOKEN="your_token" - -# Solution 3: Pass directly to client -client = BrightDataClient(token="your_token") -``` - -### Import Errors - -```bash -# If you get import errors, ensure package is installed -pip install --upgrade brightdata-sdk - -# For development installation -pip install -e . -``` - ---- - -## ๐Ÿค Contributing - -Contributions are welcome! Check the [GitHub repository](https://github.com/brightdata/sdk-python) for contribution guidelines. 
- -### Development Setup - -```bash -git clone https://github.com/brightdata/sdk-python.git -cd sdk-python - -# Install with dev dependencies -pip install -e ".[dev]" - -# Install pre-commit hooks -pre-commit install - -# Run tests -pytest tests/ -``` - ---- - -## ๐Ÿ“Š Project Stats - -- **Production Code:** ~9,000 lines -- **Test Code:** ~4,000 lines -- **Documentation:** 5 Jupyter notebooks + 10 examples -- **Test Coverage:** 502+ tests passing (Unit, Integration, E2E) -- **Supported Platforms:** Amazon, LinkedIn, ChatGPT, Facebook, Instagram, Generic Web -- **Supported Search Engines:** Google, Bing, Yandex -- **Type Safety:** 100% (Dataclasses + TypedDict) -- **Resource Efficiency:** Single shared AsyncEngine -- **Data Science Ready:** Pandas, tqdm, joblib integration -- **CLI Tool:** Full-featured command-line interface -- **Code Quality:** Enterprise-grade, FAANG standards - ---- - -## ๐Ÿ“ License - -MIT License - see [LICENSE](LICENSE) file for details. - ---- - -## ๐Ÿ”— Links - -- [Bright Data](https://brightdata.com) - Get your API token -- [API Documentation](https://docs.brightdata.com) -- [GitHub Repository](https://github.com/brightdata/sdk-python) -- [Issue Tracker](https://github.com/brightdata/sdk-python/issues) - ---- - -## ๐Ÿ’ก Examples - -### Complete Workflow Example - -```python -from brightdata import BrightDataClient - -# Initialize (auto-loads from .env or environment) +# Wrong - forgot context manager client = BrightDataClient() +result = await client.scrape_url("...") # Error! -# Test connection -if client.test_connection_sync(): - print("โœ… Connected to Bright Data API") - - # Get account info - info = client.get_account_info_sync() - print(f"Active zones: {info['zone_count']}") - - # Scrape Amazon product - product = client.scrape.amazon.products( - url="https://amazon.com/dp/B0CRMZHDG8" - ) - - if product.success: - print(f"Product: {product.data[0]['title']}") - print(f"Price: {product.data[0]['final_price']}") - print(f"Rating: {product.data[0]['rating']}") - print(f"Cost: ${product.cost:.4f}") - - # Search LinkedIn jobs - jobs = client.search.linkedin.jobs( - keyword="python developer", - location="San Francisco", - remote=True - ) - - if jobs.success: - print(f"Found {len(jobs.data)} jobs") - - # Scrape Facebook posts - fb_posts = client.scrape.facebook.posts_by_profile( - url="https://facebook.com/zuck", - num_of_posts=10, - timeout=240 - ) - - if fb_posts.success: - print(f"Scraped {len(fb_posts.data)} Facebook posts") - - # Scrape Instagram profile - ig_profile = client.scrape.instagram.profiles( - url="https://instagram.com/instagram", - timeout=240 - ) - - if ig_profile.success: - print(f"Profile: {ig_profile.data[0]['username']}") - print(f"Followers: {ig_profile.data[0]['followers_count']}") - - # Search Google - search_results = client.search.google( - query="python async tutorial", - location="United States", - num_results=10 - ) - - if search_results.success: - for i, item in enumerate(search_results.data[:5], 1): - print(f"{i}. 
{item.get('title', 'N/A')}") +# Correct - use context manager +async with BrightDataClient() as client: + result = await client.scrape_url("...") ``` -### Interactive CLI Demo - -Run the included demo to explore the SDK interactively: - -```bash -python demo_sdk.py -``` ---- - -## ๐Ÿ™ Acknowledgments - -Built with best practices from: -- Modern Python packaging (PEP 518, 621) -- Async/await patterns -- Type safety (PEP 484, 544, dataclasses) -- Enterprise-grade engineering standards -- Data science workflows (pandas, jupyter) - -### Built For -- ๐ŸŽ“ **Data Scientists** - Jupyter notebooks, pandas integration, visualization examples -- ๐Ÿ‘จโ€๐Ÿ’ป **Developers** - Type-safe API, comprehensive docs, CLI tool -- ๐Ÿข **Enterprises** - Production-ready, well-tested, resource-efficient - ---- - -## ๐ŸŒŸ Why Choose This SDK? - -- โœ… **Data Scientist Friendly** - 5 Jupyter notebooks, pandas examples, visualization guides -- โœ… **Type Safe** - Dataclass payloads with runtime validation -- โœ… **Enterprise Ready** - 502+ tests, resource efficient, production-proven -- โœ… **Well Documented** - Interactive notebooks + code examples + API docs -- โœ… **Easy to Use** - CLI tool, intuitive API, helpful error messages -- โœ… **Actively Maintained** - Regular updates, bug fixes, new features - ---- - -**Ready to start scraping?** Get your API token at [brightdata.com](https://brightdata.com/cp/api_keys) and try our [quickstart notebook](notebooks/01_quickstart.ipynb)! +## License +MIT License diff --git a/demo_sdk.py b/demo_sdk.py index 30a3997..160e165 100644 --- a/demo_sdk.py +++ b/demo_sdk.py @@ -138,7 +138,7 @@ async def test_connection(): print("Scraping https://httpbin.org/json (test URL)...") try: - result = client.scrape.generic.url("https://httpbin.org/json") + result = client.scrape_url("https://httpbin.org/json") if result.success: print("[OK] Generic scrape successful!") @@ -194,7 +194,7 @@ def test_generic_scrape(): url = url or "https://httpbin.org/html" print(f"\nScraping: {url}") - result = client.scrape.generic.url(url) + result = client.scrape_url(url) if result.success: print(f"[OK] Success!") @@ -494,7 +494,7 @@ def test_batch_scraping(): import time start = time.time() - results = client.scrape.generic.url(urls) + results = client.scrape_url(urls) elapsed = time.time() - start @@ -530,7 +530,7 @@ def test_sync_vs_async(): # Test sync mode print("\n1. Sync mode (immediate response):") start = time.time() - result_sync = client.scrape.generic.url(url) + result_sync = client.scrape_url(url) sync_time = time.time() - start print(f" Time: {sync_time:.2f}s") @@ -562,7 +562,7 @@ def show_complete_interface(): print() print("SCRAPE (URL-based extraction):") - print(" client.scrape.generic.url(url)") + print(" client.scrape_url(url)") print(" client.scrape.amazon.products(url, timeout=240)") print(" client.scrape.amazon.reviews(url, pastDays, keyWord, numOfReviews, timeout=240)") print(" client.scrape.amazon.sellers(url, timeout=240)") @@ -594,7 +594,7 @@ def show_complete_interface(): print("ASYNC USAGE:") print(" async with BrightDataClient() as client:") - print(" result = await client.scrape.generic.url_async(url)") + print(" result = await client.scrape_url(url)") print() # Interactive loop diff --git a/docs/sync_client.md b/docs/sync_client.md new file mode 100644 index 0000000..372f8af --- /dev/null +++ b/docs/sync_client.md @@ -0,0 +1,127 @@ +# Sync Client + +`SyncBrightDataClient` provides a synchronous interface for the Bright Data SDK. 
Use it when you don't need async/await or for simpler scripts. + +## Basic Usage + +```python +from brightdata import SyncBrightDataClient + +with SyncBrightDataClient() as client: + result = client.scrape_url("https://example.com") + print(result.data) +``` + +## How It Works + +- Wraps the async `BrightDataClient` with a persistent event loop +- All methods have the same signature as the async client (without `await`) +- Uses `run_until_complete()` internally for better performance than repeated `asyncio.run()` calls + +## Available Methods + +### Client Methods + +```python +client.scrape_url(url, **kwargs) # Scrape any URL +client.test_connection() # Test API connection +client.get_account_info() # Get account info +client.list_zones() # List all zones +client.delete_zone(zone_name) # Delete a zone +``` + +### Scrape Service + +```python +# Amazon +client.scrape.amazon.products(url) +client.scrape.amazon.products_trigger(url) +client.scrape.amazon.products_status(snapshot_id) +client.scrape.amazon.products_fetch(snapshot_id) +client.scrape.amazon.reviews(url) +client.scrape.amazon.sellers(url) + +# LinkedIn +client.scrape.linkedin.profiles(url) +client.scrape.linkedin.companies(url) +client.scrape.linkedin.jobs(url) +client.scrape.linkedin.posts(url) + +# Instagram +client.scrape.instagram.profiles(url) +client.scrape.instagram.posts(url) +client.scrape.instagram.comments(url) +client.scrape.instagram.reels(url) + +# Facebook +client.scrape.facebook.posts_by_profile(url) +client.scrape.facebook.posts_by_group(url) +client.scrape.facebook.comments(url) +client.scrape.facebook.reels(url) + +# ChatGPT +client.scrape.chatgpt.prompt(prompt) +client.scrape.chatgpt.prompts(prompts) +``` + +### Search Service + +```python +client.search.google(query) +client.search.bing(query) +client.search.yandex(query) +client.search.amazon.products(keyword) +client.search.linkedin.jobs(keyword) +client.search.linkedin.profiles(**kwargs) +``` + +### Crawler Service + +```python +client.crawler.crawl(url) +client.crawler.scrape(url) +``` + +## Important Notes + +### Not Thread-Safe + +`SyncBrightDataClient` is **not thread-safe**. For multi-threaded usage, create a separate client per thread: + +```python +import threading + +def worker(): + with SyncBrightDataClient() as client: + result = client.scrape_url("https://example.com") + +threads = [threading.Thread(target=worker) for _ in range(3)] +for t in threads: + t.start() +``` + +### Cannot Use Inside Async Context + +Using `SyncBrightDataClient` inside an async function will raise an error: + +```python +# Wrong - will raise RuntimeError +async def main(): + with SyncBrightDataClient() as client: # Error! + ... + +# Correct - use async client +async def main(): + async with BrightDataClient() as client: + result = await client.scrape_url("...") +``` + +## When to Use Sync vs Async + +| Use Case | Recommended | +|----------|-------------| +| Simple scripts | `SyncBrightDataClient` | +| Jupyter notebooks | `SyncBrightDataClient` | +| Web frameworks (FastAPI, etc.) 
| `BrightDataClient` (async) | +| High-volume scraping | `BrightDataClient` (async) | +| Concurrent requests | `BrightDataClient` (async) | diff --git a/examples/11_trigger_interface.py b/examples/11_trigger_interface.py index 798019f..844a981 100644 --- a/examples/11_trigger_interface.py +++ b/examples/11_trigger_interface.py @@ -34,28 +34,28 @@ async def example_basic_trigger(): # Step 1: Trigger the scrape (returns immediately) print("\n๐Ÿš€ Triggering Amazon product scrape...") - job = await amazon.products_trigger_async( + job = await amazon.products_trigger( url="https://www.amazon.com/dp/B0CRMZHDG8" ) print(f"โœ… Job triggered: {job.snapshot_id}") # Step 2: Check status manually print("\n๐Ÿ” Checking job status...") - status = await job.status_async() + status = await job.status() print(f"Status: {status}") # Step 3: Wait for completion (with custom timeout) print("\nโณ Waiting for completion...") - await job.wait_async(timeout=180, verbose=True) + await job.wait(timeout=180, verbose=True) # Step 4: Fetch results print("\n๐Ÿ“ฅ Fetching results...") - data = await job.fetch_async() + data = await job.fetch() print(f"โœ… Got {len(data) if isinstance(data, list) else 1} records") # Or use convenience method (wait + fetch + wrap in ScrapeResult) print("\n๐Ÿ’ก Alternative: Use to_result_async()...") - result = await job.to_result_async() + result = await job.to_result() print(f"Success: {result.success}") print(f"Cost: ${result.cost:.4f}") @@ -85,7 +85,7 @@ async def example_concurrent_scraping(): print("\n๐Ÿš€ Triggering multiple scrapes...") jobs = [] for i, url in enumerate(urls, 1): - job = await amazon.products_trigger_async(url=url) + job = await amazon.products_trigger(url=url) jobs.append(job) print(f" [{i}/{len(urls)}] Triggered: {job.snapshot_id[:12]}...") @@ -96,7 +96,7 @@ async def example_concurrent_scraping(): results = [] for i, job in enumerate(jobs, 1): print(f" [{i}/{len(jobs)}] Waiting for job {job.snapshot_id[:12]}...") - result = await job.to_result_async(timeout=180) + result = await job.to_result(timeout=180) results.append(result) # Step 3: Process all results @@ -124,7 +124,7 @@ async def example_custom_polling(): # Trigger the scrape print("\n๐Ÿš€ Triggering scrape...") - job = await amazon.products_trigger_async( + job = await amazon.products_trigger( url="https://www.amazon.com/dp/B0CRMZHDG8" ) print(f"โœ… Job ID: {job.snapshot_id}") @@ -136,14 +136,14 @@ async def example_custom_polling(): max_attempts = 30 for attempt in range(max_attempts): - status = await job.status_async() + status = await job.status() elapsed = time.time() - job.triggered_at.timestamp() print(f" [{elapsed:.1f}s] Attempt {attempt + 1}: {status}") if status == "ready": print("โœ… Job completed!") - data = await job.fetch_async() + data = await job.fetch() print(f"๐Ÿ“ฅ Got {len(data) if isinstance(data, list) else 1} records") break elif status == "error": @@ -173,7 +173,7 @@ async def example_save_and_resume(): # Phase 1: Trigger and save job ID print("\n๐Ÿ“ Phase 1: Trigger and save job ID...") - job = await amazon.products_trigger_async( + job = await amazon.products_trigger( url="https://www.amazon.com/dp/B0CRMZHDG8" ) snapshot_id = job.snapshot_id @@ -189,12 +189,12 @@ async def example_save_and_resume(): print(f"๐Ÿ“‚ Loading snapshot_id: {snapshot_id}") # Check status using the snapshot_id directly - status = await amazon.products_status_async(snapshot_id) + status = await amazon.products_status(snapshot_id) print(f"Status: {status}") # Fetch if ready if status == "ready": - data 
= await amazon.products_fetch_async(snapshot_id) + data = await amazon.products_fetch(snapshot_id) print(f"โœ… Fetched {len(data) if isinstance(data, list) else 1} records") else: print("โณ Job not ready yet, would need to wait longer...") diff --git a/src/brightdata/__init__.py b/src/brightdata/__init__.py index 1201822..171b593 100644 --- a/src/brightdata/__init__.py +++ b/src/brightdata/__init__.py @@ -2,8 +2,11 @@ __version__ = "2.0.0" -# Export main client -from .client import BrightDataClient, BrightData # BrightData is alias for backward compat +# Export main client (async) +from .client import BrightDataClient + +# Export sync client adapter +from .sync_client import SyncBrightDataClient # Export result models from .models import ( @@ -69,9 +72,10 @@ __all__ = [ "__version__", - # Main client + # Main client (async) "BrightDataClient", - "BrightData", # Backward compatibility alias + # Sync client adapter + "SyncBrightDataClient", # Result models "BaseResult", "ScrapeResult", diff --git a/src/brightdata/api/base.py b/src/brightdata/api/base.py index f99fa15..31cc7be 100644 --- a/src/brightdata/api/base.py +++ b/src/brightdata/api/base.py @@ -1,6 +1,5 @@ """Base API class for all API implementations.""" -import asyncio from abc import ABC, abstractmethod from typing import Any from ..core.engine import AsyncEngine @@ -10,8 +9,8 @@ class BaseAPI(ABC): """ Base class for all API implementations. - Provides common structure and async/sync wrapper pattern - for all API service classes. + Provides common structure for all API service classes. + All methods are async-only. For sync usage, use SyncBrightDataClient. """ def __init__(self, engine: AsyncEngine): @@ -32,23 +31,3 @@ async def _execute_async(self, *args: Any, **kwargs: Any) -> Any: the actual async API operation. """ pass - - def _execute_sync(self, *args: Any, **kwargs: Any) -> Any: - """ - Execute API operation synchronously. - - Wraps async method using asyncio.run() for sync compatibility. - Properly manages engine context. - """ - try: - asyncio.get_running_loop() - raise RuntimeError( - "Cannot call sync method from async context. Use async method instead." - ) - except RuntimeError: - - async def _run(): - async with self.engine: - return await self._execute_async(*args, **kwargs) - - return asyncio.run(_run()) diff --git a/src/brightdata/api/scrape_service.py b/src/brightdata/api/scrape_service.py index 7b367be..86d721b 100644 --- a/src/brightdata/api/scrape_service.py +++ b/src/brightdata/api/scrape_service.py @@ -2,9 +2,9 @@ Scraping service namespace. Provides hierarchical access to specialized scrapers and generic scraping. +All methods are async-only. For sync usage, use SyncBrightDataClient. 
""" -import asyncio from typing import Union, List, TYPE_CHECKING from ..models import ScrapeResult @@ -28,7 +28,6 @@ def __init__(self, client: "BrightDataClient"): self._chatgpt = None self._facebook = None self._instagram = None - self._generic = None @property def amazon(self): @@ -184,39 +183,4 @@ def instagram(self): ) return self._instagram - @property - def generic(self): - """Access generic web scraper (Web Unlocker).""" - if self._generic is None: - self._generic = GenericScraper(self._client) - return self._generic - - -class GenericScraper: - """Generic web scraper using Web Unlocker API.""" - - def __init__(self, client: "BrightDataClient"): - """Initialize generic scraper.""" - self._client = client - async def url_async( - self, - url: Union[str, List[str]], - country: str = "", - response_format: str = "raw", - ) -> Union[ScrapeResult, List[ScrapeResult]]: - """Scrape URL(s) asynchronously.""" - return await self._client.scrape_url_async( - url=url, - country=country, - response_format=response_format, - ) - - def url(self, *args, **kwargs) -> Union[ScrapeResult, List[ScrapeResult]]: - """Scrape URL(s) synchronously.""" - - async def _run(): - async with self._client.engine: - return await self.url_async(*args, **kwargs) - - return asyncio.run(_run()) diff --git a/src/brightdata/api/search_service.py b/src/brightdata/api/search_service.py index d9d15ae..6149919 100644 --- a/src/brightdata/api/search_service.py +++ b/src/brightdata/api/search_service.py @@ -3,9 +3,9 @@ Provides access to search engine result scrapers with normalized data across different search engines. +All methods are async-only. For sync usage, use SyncBrightDataClient. """ -import asyncio from typing import Optional, Union, List, TYPE_CHECKING from ..models import SearchResult @@ -51,7 +51,7 @@ def __init__(self, client: "BrightDataClient"): self._chatgpt_search: Optional["ChatGPTSearchService"] = None self._instagram_search: Optional["InstagramSearchScraper"] = None - async def google_async( + async def google( self, query: Union[str, List[str]], location: Optional[str] = None, @@ -77,11 +77,12 @@ async def google_async( SearchResult with normalized Google search data Example: - >>> result = await client.search.google_async( - ... query="python tutorial", - ... location="United States", - ... num_results=20 - ... ) + >>> async with BrightDataClient() as client: + ... result = await client.search.google( + ... query="python tutorial", + ... location="United States", + ... num_results=20 + ... ) """ from .serp import GoogleSERPService @@ -92,7 +93,7 @@ async def google_async( ) zone = zone or self._client.serp_zone - return await self._google_service.search_async( + return await self._google_service.search( query=query, zone=zone, location=location, @@ -102,28 +103,8 @@ async def google_async( **kwargs, ) - def google( - self, query: Union[str, List[str]], **kwargs - ) -> Union[SearchResult, List[SearchResult]]: - """ - Search Google synchronously. - - See google_async() for full documentation. - - Example: - >>> result = client.search.google( - ... query="python tutorial", - ... location="United States" - ... 
) - """ - - async def _run(): - async with self._client.engine: - return await self.google_async(query, **kwargs) - return asyncio.run(_run()) - - async def bing_async( + async def bing( self, query: Union[str, List[str]], location: Optional[str] = None, @@ -142,7 +123,7 @@ async def bing_async( ) zone = zone or self._client.serp_zone - return await self._bing_service.search_async( + return await self._bing_service.search( query=query, zone=zone, location=location, @@ -151,16 +132,8 @@ async def bing_async( **kwargs, ) - def bing(self, query: Union[str, List[str]], **kwargs): - """Search Bing synchronously.""" - - async def _run(): - async with self._client.engine: - return await self.bing_async(query, **kwargs) - return asyncio.run(_run()) - - async def yandex_async( + async def yandex( self, query: Union[str, List[str]], location: Optional[str] = None, @@ -179,7 +152,7 @@ async def yandex_async( ) zone = zone or self._client.serp_zone - return await self._yandex_service.search_async( + return await self._yandex_service.search( query=query, zone=zone, location=location, @@ -188,14 +161,6 @@ async def yandex_async( **kwargs, ) - def yandex(self, query: Union[str, List[str]], **kwargs): - """Search Yandex synchronously.""" - - async def _run(): - async with self._client.engine: - return await self.yandex_async(query, **kwargs) - - return asyncio.run(_run()) @property def amazon(self): diff --git a/src/brightdata/api/serp/base.py b/src/brightdata/api/serp/base.py index f844fe9..2db47d8 100644 --- a/src/brightdata/api/serp/base.py +++ b/src/brightdata/api/serp/base.py @@ -53,7 +53,7 @@ def __init__( self.timeout = timeout or self.DEFAULT_TIMEOUT self.max_retries = max_retries - async def search_async( + async def search( self, query: Union[str, List[str]], zone: str, @@ -77,6 +77,11 @@ async def search_async( Returns: SearchResult for single query, List[SearchResult] for multiple + + Note: + For synchronous usage, use SyncBrightDataClient instead: + >>> with SyncBrightDataClient() as client: + ... result = client.search.google(query) """ is_single = isinstance(query, str) query_list = [query] if is_single else query @@ -106,9 +111,6 @@ async def search_async( **kwargs, ) - def search(self, *args, **kwargs): - """Synchronous search wrapper.""" - return asyncio.run(self.search_async(*args, **kwargs)) async def _search_single_async( self, diff --git a/src/brightdata/api/web_unlocker.py b/src/brightdata/api/web_unlocker.py index 6e53875..fd7355e 100644 --- a/src/brightdata/api/web_unlocker.py +++ b/src/brightdata/api/web_unlocker.py @@ -1,4 +1,7 @@ -"""Web Unlocker API - High-level service wrapper for Bright Data's Web Unlocker proxy service.""" +"""Web Unlocker API - High-level service wrapper for Bright Data's Web Unlocker proxy service. + +All methods are async-only. For sync usage, use SyncBrightDataClient. +""" from typing import Union, List, Optional, Dict, Any from datetime import datetime, timezone @@ -224,34 +227,4 @@ async def _scrape_multiple_async( return processed_results - def scrape( - self, - url: Union[str, List[str]], - zone: str, - country: str = "", - response_format: str = "raw", - method: str = "GET", - timeout: Optional[int] = None, - ) -> Union[ScrapeResult, List[ScrapeResult]]: - """ - Scrape URL(s) synchronously. - - Args: - url: Single URL string or list of URLs to scrape. - zone: Bright Data zone identifier. - country: Two-letter ISO country code for proxy location (optional). - response_format: Response format - "json" for structured data, "raw" for HTML string. 
- method: HTTP method for the request (default: "GET"). - timeout: Request timeout in seconds. - - Returns: - ScrapeResult for single URL, or List[ScrapeResult] for multiple URLs. - """ - return self._execute_sync( - url=url, - zone=zone, - country=country, - response_format=response_format, - method=method, - timeout=timeout, - ) + scrape = scrape_async diff --git a/src/brightdata/cli/commands/scrape.py b/src/brightdata/cli/commands/scrape.py index 0cab2f8..3b01836 100644 --- a/src/brightdata/cli/commands/scrape.py +++ b/src/brightdata/cli/commands/scrape.py @@ -41,16 +41,16 @@ def scrape_group( # ============================================================================ -@scrape_group.command("generic") +@scrape_group.command("url") @click.argument("url", required=True) @click.option("--country", default="", help="Country code for targeting") @click.option("--response-format", default="raw", help="Response format (raw, json)") @click.pass_context -def scrape_generic(ctx: click.Context, url: str, country: str, response_format: str) -> None: - """Scrape any URL using generic web scraper.""" +def scrape_url(ctx: click.Context, url: str, country: str, response_format: str) -> None: + """Scrape any URL using Web Unlocker.""" try: client = create_client(ctx.obj["api_key"]) - result = client.scrape.generic.url( + result = client.scrape_url( url=url, country=country, response_format=response_format ) output_result(result, ctx.obj["output_format"], ctx.obj["output_file"]) diff --git a/src/brightdata/client.py b/src/brightdata/client.py index 5cfa222..164886f 100644 --- a/src/brightdata/client.py +++ b/src/brightdata/client.py @@ -139,8 +139,21 @@ def __init__( self._account_info: Optional[Dict[str, Any]] = None self._zones_ensured = False - if validate_token: - self._validate_token_sync() + # Store for validation during __aenter__ + self._validate_token_on_enter = validate_token + + def _ensure_initialized(self) -> None: + """ + Ensure client is properly initialized (used as context manager). + + Raises: + RuntimeError: If client not initialized via context manager + """ + if self.engine._session is None: + raise RuntimeError( + "BrightDataClient not initialized. " + "Use: async with BrightDataClient() as client: ..." + ) def _load_token(self, token: Optional[str]) -> str: """ @@ -179,28 +192,6 @@ def _load_token(self, token: Optional[str]) -> str: f"Get your API token from: https://brightdata.com/cp/api_keys" ) - def _validate_token_sync(self) -> None: - """ - Validate token synchronously during initialization. - - Raises: - AuthenticationError: If token is invalid - """ - try: - is_valid = asyncio.run(self.test_connection()) - if not is_valid: - raise AuthenticationError( - "Token validation failed. Token appears to be invalid.\n" - "Check your token at: https://brightdata.com/cp/api_keys" - ) - except AuthenticationError: - raise - except Exception as e: - raise AuthenticationError( - f"Failed to validate token: {str(e)}\n" - f"Check your token at: https://brightdata.com/cp/api_keys" - ) - async def _ensure_zones(self) -> None: """ Ensure required zones exist if auto_create_zones is enabled. @@ -235,7 +226,7 @@ def scrape(self) -> ScrapeService: Provides hierarchical access to specialized scrapers: - client.scrape.amazon.products(...) - client.scrape.linkedin.profiles(...) - - client.scrape.generic.url(...) + - client.scrape_url(...) Returns: ScrapeService instance for accessing scrapers @@ -311,24 +302,25 @@ async def test_connection(self) -> bool: (invalid token, network issues, etc.). 
This makes it safe for testing connectivity without exception handling. + Client must be used as context manager before calling this method. + Example: - >>> is_valid = await client.test_connection() - >>> if is_valid: - ... print("Connected successfully!") - >>> else: - ... print("Connection failed") + >>> async with BrightDataClient() as client: + ... is_valid = await client.test_connection() + ... if is_valid: + ... print("Connected successfully!") """ + self._ensure_initialized() try: - async with self.engine: - async with self.engine.get_from_url( - f"{self.engine.BASE_URL}/zone/get_active_zones" - ) as response: - if response.status == HTTP_OK: - self._is_connected = True - return True - else: - self._is_connected = False - return False + async with self.engine.get_from_url( + f"{self.engine.BASE_URL}/zone/get_active_zones" + ) as response: + if response.status == HTTP_OK: + self._is_connected = True + return True + else: + self._is_connected = False + return False except (asyncio.TimeoutError, OSError, Exception): self._is_connected = False @@ -373,102 +365,55 @@ async def get_account_info(self, refresh: bool = False) -> AccountInfo: if self._account_info is not None and not refresh: return self._account_info + self._ensure_initialized() try: - # Engine context manager is idempotent, safe to enter multiple times - async with self.engine: - async with self.engine.get_from_url( - f"{self.engine.BASE_URL}/zone/get_active_zones" - ) as zones_response: - if zones_response.status == HTTP_OK: - zones = await zones_response.json() - zones = zones or [] - - # Warn user if no active zones found (they might be inactive) - if not zones: - warnings.warn( - "No active zones found. This could mean:\n" - "1. Your zones might be inactive - activate them in the Bright Data dashboard\n" - "2. You might need to create zones first\n" - "3. Check your dashboard at https://brightdata.com for zone status\n\n" - "Note: The API only returns active zones. Inactive zones won't appear here.", - UserWarning, - stacklevel=2, - ) - - account_info = { - "customer_id": self.customer_id, - "zones": zones, - "zone_count": len(zones), - "token_valid": True, - "retrieved_at": datetime.now(timezone.utc).isoformat(), - } - - self._account_info = account_info - return account_info - - elif zones_response.status in (HTTP_UNAUTHORIZED, HTTP_FORBIDDEN): - error_text = await zones_response.text() - raise AuthenticationError( - f"Invalid token (HTTP {zones_response.status}): {error_text}" - ) - else: - error_text = await zones_response.text() - raise APIError( - f"Failed to get account info (HTTP {zones_response.status}): {error_text}", - status_code=zones_response.status, + async with self.engine.get_from_url( + f"{self.engine.BASE_URL}/zone/get_active_zones" + ) as zones_response: + if zones_response.status == HTTP_OK: + zones = await zones_response.json() + zones = zones or [] + + # Warn user if no active zones found (they might be inactive) + if not zones: + warnings.warn( + "No active zones found. This could mean:\n" + "1. Your zones might be inactive - activate them in the Bright Data dashboard\n" + "2. You might need to create zones first\n" + "3. Check your dashboard at https://brightdata.com for zone status\n\n" + "Note: The API only returns active zones. 
Inactive zones won't appear here.", + UserWarning, + stacklevel=2, ) + account_info = { + "customer_id": self.customer_id, + "zones": zones, + "zone_count": len(zones), + "token_valid": True, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + + self._account_info = account_info + return account_info + + elif zones_response.status in (HTTP_UNAUTHORIZED, HTTP_FORBIDDEN): + error_text = await zones_response.text() + raise AuthenticationError( + f"Invalid token (HTTP {zones_response.status}): {error_text}" + ) + else: + error_text = await zones_response.text() + raise APIError( + f"Failed to get account info (HTTP {zones_response.status}): {error_text}", + status_code=zones_response.status, + ) + except (AuthenticationError, APIError): raise except Exception as e: raise APIError(f"Unexpected error getting account info: {str(e)}") - def _run_async_with_cleanup(self, coro): - """ - Run an async coroutine with proper cleanup. - - This helper ensures that the event loop stays open long enough - for all sessions and connectors to close properly, preventing - "Unclosed client session" warnings. - """ - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - try: - result = loop.run_until_complete(coro) - # Give pending tasks and cleanup handlers time to complete - # This is crucial for aiohttp session cleanup - loop.run_until_complete(asyncio.sleep(0.25)) - return result - finally: - try: - # Cancel any remaining tasks - pending = asyncio.all_tasks(loop) - for task in pending: - task.cancel() - # Run the loop once more to process cancellations - if pending: - loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True)) - # Final sleep to ensure all cleanup completes - loop.run_until_complete(asyncio.sleep(0.1)) - finally: - loop.close() - - def get_account_info_sync(self, refresh: bool = False) -> AccountInfo: - """ - Synchronous version of get_account_info(). - - Args: - refresh: If True, bypass cache and fetch fresh data (default: False) - """ - return self._run_async_with_cleanup(self.get_account_info(refresh=refresh)) - - def test_connection_sync(self) -> bool: - """Synchronous version of test_connection().""" - try: - return self._run_async_with_cleanup(self.test_connection()) - except Exception: - return False - async def list_zones(self) -> List[Dict[str, Any]]: """ List all active zones in your Bright Data account. @@ -481,15 +426,16 @@ async def list_zones(self) -> List[Dict[str, Any]]: AuthenticationError: If authentication fails Example: - >>> zones = await client.list_zones() - >>> print(f"Found {len(zones)} zones") - >>> for zone in zones: - ... print(f" - {zone['name']}: {zone.get('type', 'unknown')}") + >>> async with BrightDataClient() as client: + ... zones = await client.list_zones() + ... print(f"Found {len(zones)} zones") + ... for zone in zones: + ... print(f" - {zone['name']}: {zone.get('type', 'unknown')}") """ - async with self.engine: - if self._zone_manager is None: - self._zone_manager = ZoneManager(self.engine) - return await self._zone_manager.list_zones() + self._ensure_initialized() + if self._zone_manager is None: + self._zone_manager = ZoneManager(self.engine) + return await self._zone_manager.list_zones() async def delete_zone(self, zone_name: str) -> None: """ @@ -514,20 +460,12 @@ async def delete_zone(self, zone_name: str) -> None: ... except ZoneError as e: ... 
print(f"Failed to delete zone: {e}") """ - async with self.engine: - if self._zone_manager is None: - self._zone_manager = ZoneManager(self.engine) - await self._zone_manager.delete_zone(zone_name) - - def list_zones_sync(self) -> List[Dict[str, Any]]: - """Synchronous version of list_zones().""" - return self._run_async_with_cleanup(self.list_zones()) - - def delete_zone_sync(self, zone_name: str) -> None: - """Synchronous version of delete_zone().""" - return self._run_async_with_cleanup(self.delete_zone(zone_name)) + self._ensure_initialized() + if self._zone_manager is None: + self._zone_manager = ZoneManager(self.engine) + await self._zone_manager.delete_zone(zone_name) - async def scrape_url_async( + async def scrape_url( self, url: Union[str, List[str]], zone: Optional[str] = None, @@ -540,29 +478,36 @@ async def scrape_url_async( Direct scraping method (flat API). For backward compatibility. Prefer using hierarchical API: - client.scrape.generic.url(...) for new code. + client.scrape_url(...) for new code. """ - async with self.engine: - if self._web_unlocker_service is None: - self._web_unlocker_service = WebUnlockerService(self.engine) - - zone = zone or self.web_unlocker_zone - return await self._web_unlocker_service.scrape_async( - url=url, - zone=zone, - country=country, - response_format=response_format, - method=method, - timeout=timeout, - ) + self._ensure_initialized() + if self._web_unlocker_service is None: + self._web_unlocker_service = WebUnlockerService(self.engine) + + zone = zone or self.web_unlocker_zone + return await self._web_unlocker_service.scrape_async( + url=url, + zone=zone, + country=country, + response_format=response_format, + method=method, + timeout=timeout, + ) - def scrape_url(self, *args, **kwargs) -> Union[ScrapeResult, List[ScrapeResult]]: - """Synchronous version of scrape_url_async().""" - return asyncio.run(self.scrape_url_async(*args, **kwargs)) async def __aenter__(self): """Async context manager entry.""" await self.engine.__aenter__() + + # Validate token if requested + if self._validate_token_on_enter: + is_valid = await self.test_connection() + if not is_valid: + await self.engine.__aexit__(None, None, None) + raise AuthenticationError( + "Token validation failed. Please check your API token." + ) + await self._ensure_zones() return self @@ -577,4 +522,3 @@ def __repr__(self) -> str: return f"" -BrightData = BrightDataClient diff --git a/src/brightdata/core/engine.py b/src/brightdata/core/engine.py index ce7f35a..6f72949 100644 --- a/src/brightdata/core/engine.py +++ b/src/brightdata/core/engine.py @@ -22,6 +22,9 @@ # resource tracking may still emit warnings during rapid create/destroy cycles warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.* Union[ScrapeResult, List[ScrapeResult]]: """ - Scrape Amazon products from URLs (async). + Scrape Amazon products from URLs. Uses standard async workflow: trigger job, poll until ready, then fetch results. @@ -72,10 +76,11 @@ async def products_async( ScrapeResult or List[ScrapeResult] with product data Example: - >>> result = await scraper.products_async( - ... url="https://amazon.com/dp/B0CRMZHDG8", - ... timeout=240 - ... ) + >>> async with AmazonScraper(token="...") as scraper: + ... result = await scraper.products( + ... url="https://amazon.com/dp/B0CRMZHDG8", + ... timeout=240 + ... 
) """ # Validate URLs if isinstance(url, str): @@ -85,39 +90,32 @@ async def products_async( return await self._scrape_urls(url=url, dataset_id=self.DATASET_ID, timeout=timeout) - def products( + + def products_sync( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_MEDIUM, ) -> Union[ScrapeResult, List[ScrapeResult]]: """ - Scrape Amazon products (sync wrapper). + Scrape Amazon products from URLs (sync version). - See products_async() for documentation. - - Example: - >>> result = scraper.products( - ... url="https://amazon.com/dp/B123", - ... timeout=240 - ... ) + See products() for full documentation. """ - async def _run(): async with self.engine: - return await self.products_async(url, timeout=timeout) - + return await self.products(url, timeout) return asyncio.run(_run()) # ============================================================================ # PRODUCTS TRIGGER/STATUS/FETCH (Manual Control) # ============================================================================ - async def products_trigger_async( + async def products_trigger( self, url: Union[str, List[str]], ) -> ScrapeJob: """ - Trigger Amazon products scrape (async - manual control). + Trigger Amazon products scrape (manual control). Starts a scrape operation and returns immediately with a Job object. Use the Job to check status and fetch results when ready. @@ -129,30 +127,26 @@ async def products_trigger_async( ScrapeJob object for status checking and result fetching Example: - >>> # Trigger and manual control - >>> job = await scraper.products_trigger_async("https://amazon.com/dp/B123") - >>> print(f"Job ID: {job.snapshot_id}") - >>> - >>> # Check status later - >>> status = await job.status_async() - >>> if status == "ready": - ... data = await job.fetch_async() + >>> async with AmazonScraper(token="...") as scraper: + ... job = await scraper.products_trigger("https://amazon.com/dp/B123") + ... print(f"Job ID: {job.snapshot_id}") + ... status = await job.status() + ... if status == "ready": + ... data = await job.fetch() """ sdk_function = get_caller_function_name() return await self._trigger_scrape_async( urls=url, sdk_function=sdk_function or "products_trigger" ) - def products_trigger( - self, - url: Union[str, List[str]], - ) -> ScrapeJob: - """Trigger Amazon products scrape (sync wrapper).""" - return asyncio.run(self.products_trigger_async(url)) - async def products_status_async(self, snapshot_id: str) -> str: + def products_trigger_sync(self, url: Union[str, List[str]]) -> ScrapeJob: + """Trigger Amazon products scrape (sync version).""" + return asyncio.run(self.products_trigger(url)) + + async def products_status(self, snapshot_id: str) -> str: """ - Check Amazon products scrape status (async). + Check Amazon products scrape status. 
Args: snapshot_id: Snapshot ID from trigger operation @@ -161,17 +155,18 @@ async def products_status_async(self, snapshot_id: str) -> str: Status string: "ready", "in_progress", "error" Example: - >>> status = await scraper.products_status_async(snapshot_id) + >>> status = await scraper.products_status(snapshot_id) """ return await self._check_status_async(snapshot_id) - def products_status(self, snapshot_id: str) -> str: - """Check Amazon products scrape status (sync wrapper).""" - return asyncio.run(self.products_status_async(snapshot_id)) - async def products_fetch_async(self, snapshot_id: str) -> Any: + def products_status_sync(self, snapshot_id: str) -> str: + """Check Amazon products scrape status (sync version).""" + return asyncio.run(self.products_status(snapshot_id)) + + async def products_fetch(self, snapshot_id: str) -> Any: """ - Fetch Amazon products scrape results (async). + Fetch Amazon products scrape results. Args: snapshot_id: Snapshot ID from trigger operation @@ -180,19 +175,20 @@ async def products_fetch_async(self, snapshot_id: str) -> Any: Product data Example: - >>> data = await scraper.products_fetch_async(snapshot_id) + >>> data = await scraper.products_fetch(snapshot_id) """ return await self._fetch_results_async(snapshot_id) - def products_fetch(self, snapshot_id: str) -> Any: - """Fetch Amazon products scrape results (sync wrapper).""" - return asyncio.run(self.products_fetch_async(snapshot_id)) + + def products_fetch_sync(self, snapshot_id: str) -> Any: + """Fetch Amazon products scrape results (sync version).""" + return asyncio.run(self.products_fetch(snapshot_id)) # ============================================================================ # REVIEWS EXTRACTION (URL-based with filters) # ============================================================================ - async def reviews_async( + async def reviews( self, url: Union[str, List[str]], pastDays: Optional[int] = None, @@ -201,7 +197,7 @@ async def reviews_async( timeout: int = DEFAULT_TIMEOUT_MEDIUM, ) -> Union[ScrapeResult, List[ScrapeResult]]: """ - Scrape Amazon product reviews from URLs (async). + Scrape Amazon product reviews from URLs. Uses standard async workflow: trigger job, poll until ready, then fetch results. @@ -216,13 +212,14 @@ async def reviews_async( ScrapeResult or List[ScrapeResult] with reviews data Example: - >>> result = await scraper.reviews_async( - ... url="https://amazon.com/dp/B123", - ... pastDays=30, - ... keyWord="quality", - ... numOfReviews=100, - ... timeout=240 - ... ) + >>> async with AmazonScraper(token="...") as scraper: + ... result = await scraper.reviews( + ... url="https://amazon.com/dp/B123", + ... pastDays=30, + ... keyWord="quality", + ... numOfReviews=100, + ... timeout=240 + ... ) """ # Validate URLs if isinstance(url, str): @@ -279,7 +276,8 @@ async def reviews_async( return results return result - def reviews( + + def reviews_sync( self, url: Union[str, List[str]], pastDays: Optional[int] = None, @@ -288,30 +286,20 @@ def reviews( timeout: int = DEFAULT_TIMEOUT_MEDIUM, ) -> Union[ScrapeResult, List[ScrapeResult]]: """ - Scrape Amazon reviews (sync wrapper). - - See reviews_async() for documentation. + Scrape Amazon product reviews from URLs (sync version). - Example: - >>> result = scraper.reviews( - ... url="https://amazon.com/dp/B123", - ... pastDays=7, - ... numOfReviews=50, - ... timeout=240 - ... ) + See reviews() for full documentation. 
""" - async def _run(): async with self.engine: - return await self.reviews_async(url, pastDays, keyWord, numOfReviews, timeout) - + return await self.reviews(url, pastDays, keyWord, numOfReviews, timeout) return asyncio.run(_run()) # ============================================================================ # REVIEWS TRIGGER/STATUS/FETCH (Manual Control) # ============================================================================ - async def reviews_trigger_async( + async def reviews_trigger( self, url: Union[str, List[str]], pastDays: Optional[int] = None, @@ -319,7 +307,7 @@ async def reviews_trigger_async( numOfReviews: Optional[int] = None, ) -> ScrapeJob: """ - Trigger Amazon reviews scrape (async - manual control). + Trigger Amazon reviews scrape (manual control). Starts a scrape operation and returns immediately with a Job object. @@ -333,9 +321,9 @@ async def reviews_trigger_async( ScrapeJob object for status checking and result fetching Example: - >>> job = await scraper.reviews_trigger_async("https://amazon.com/dp/B123", pastDays=30) - >>> status = await job.status_async() - >>> data = await job.fetch_async() + >>> job = await scraper.reviews_trigger("https://amazon.com/dp/B123", pastDays=30) + >>> status = await job.status() + >>> data = await job.fetch() """ sdk_function = get_caller_function_name() return await self._trigger_scrape_async( @@ -344,43 +332,46 @@ async def reviews_trigger_async( sdk_function=sdk_function or "reviews_trigger", ) - def reviews_trigger( + + def reviews_trigger_sync( self, url: Union[str, List[str]], pastDays: Optional[int] = None, keyWord: Optional[str] = None, numOfReviews: Optional[int] = None, ) -> ScrapeJob: - """Trigger Amazon reviews scrape (sync wrapper).""" - return asyncio.run(self.reviews_trigger_async(url, pastDays, keyWord, numOfReviews)) + """Trigger Amazon reviews scrape (sync version).""" + return asyncio.run(self.reviews_trigger(url, pastDays, keyWord, numOfReviews)) - async def reviews_status_async(self, snapshot_id: str) -> str: - """Check Amazon reviews scrape status (async).""" + async def reviews_status(self, snapshot_id: str) -> str: + """Check Amazon reviews scrape status.""" return await self._check_status_async(snapshot_id) - def reviews_status(self, snapshot_id: str) -> str: - """Check Amazon reviews scrape status (sync wrapper).""" - return asyncio.run(self.reviews_status_async(snapshot_id)) - async def reviews_fetch_async(self, snapshot_id: str) -> Any: - """Fetch Amazon reviews scrape results (async).""" + def reviews_status_sync(self, snapshot_id: str) -> str: + """Check Amazon reviews scrape status (sync version).""" + return asyncio.run(self.reviews_status(snapshot_id)) + + async def reviews_fetch(self, snapshot_id: str) -> Any: + """Fetch Amazon reviews scrape results.""" return await self._fetch_results_async(snapshot_id) - def reviews_fetch(self, snapshot_id: str) -> Any: - """Fetch Amazon reviews scrape results (sync wrapper).""" - return asyncio.run(self.reviews_fetch_async(snapshot_id)) + + def reviews_fetch_sync(self, snapshot_id: str) -> Any: + """Fetch Amazon reviews scrape results (sync version).""" + return asyncio.run(self.reviews_fetch(snapshot_id)) # ============================================================================ # SELLERS EXTRACTION (URL-based) # ============================================================================ - async def sellers_async( + async def sellers( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_MEDIUM, ) -> Union[ScrapeResult, List[ScrapeResult]]: 
""" - Scrape Amazon seller information from URLs (async). + Scrape Amazon seller information from URLs. Uses standard async workflow: trigger job, poll until ready, then fetch results. @@ -392,10 +383,11 @@ async def sellers_async( ScrapeResult or List[ScrapeResult] with seller data Example: - >>> result = await scraper.sellers_async( - ... url="https://amazon.com/sp?seller=AXXXXXXXXXXX", - ... timeout=240 - ... ) + >>> async with AmazonScraper(token="...") as scraper: + ... result = await scraper.sellers( + ... url="https://amazon.com/sp?seller=AXXXXXXXXXXX", + ... timeout=240 + ... ) """ # Validate URLs if isinstance(url, str): @@ -405,33 +397,32 @@ async def sellers_async( return await self._scrape_urls(url=url, dataset_id=self.DATASET_ID_SELLERS, timeout=timeout) - def sellers( + + def sellers_sync( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_MEDIUM, ) -> Union[ScrapeResult, List[ScrapeResult]]: """ - Scrape Amazon sellers (sync wrapper). + Scrape Amazon seller information from URLs (sync version). - See sellers_async() for documentation. + See sellers() for full documentation. """ - async def _run(): async with self.engine: - return await self.sellers_async(url, timeout) - + return await self.sellers(url, timeout) return asyncio.run(_run()) # ============================================================================ # SELLERS TRIGGER/STATUS/FETCH (Manual Control) # ============================================================================ - async def sellers_trigger_async( + async def sellers_trigger( self, url: Union[str, List[str]], ) -> ScrapeJob: """ - Trigger Amazon sellers scrape (async - manual control). + Trigger Amazon sellers scrape (manual control). Starts a scrape operation and returns immediately with a Job object. 
@@ -442,9 +433,9 @@ async def sellers_trigger_async( ScrapeJob object for status checking and result fetching Example: - >>> job = await scraper.sellers_trigger_async("https://amazon.com/sp?seller=AXXX") - >>> await job.wait_async() - >>> data = await job.fetch_async() + >>> job = await scraper.sellers_trigger("https://amazon.com/sp?seller=AXXX") + >>> await job.wait() + >>> data = await job.fetch() """ sdk_function = get_caller_function_name() return await self._trigger_scrape_async( @@ -453,28 +444,28 @@ async def sellers_trigger_async( sdk_function=sdk_function or "sellers_trigger", ) - def sellers_trigger( - self, - url: Union[str, List[str]], - ) -> ScrapeJob: - """Trigger Amazon sellers scrape (sync wrapper).""" - return asyncio.run(self.sellers_trigger_async(url)) - async def sellers_status_async(self, snapshot_id: str) -> str: - """Check Amazon sellers scrape status (async).""" + def sellers_trigger_sync(self, url: Union[str, List[str]]) -> ScrapeJob: + """Trigger Amazon sellers scrape (sync version).""" + return asyncio.run(self.sellers_trigger(url)) + + async def sellers_status(self, snapshot_id: str) -> str: + """Check Amazon sellers scrape status.""" return await self._check_status_async(snapshot_id) - def sellers_status(self, snapshot_id: str) -> str: - """Check Amazon sellers scrape status (sync wrapper).""" - return asyncio.run(self.sellers_status_async(snapshot_id)) - async def sellers_fetch_async(self, snapshot_id: str) -> Any: - """Fetch Amazon sellers scrape results (async).""" + def sellers_status_sync(self, snapshot_id: str) -> str: + """Check Amazon sellers scrape status (sync version).""" + return asyncio.run(self.sellers_status(snapshot_id)) + + async def sellers_fetch(self, snapshot_id: str) -> Any: + """Fetch Amazon sellers scrape results.""" return await self._fetch_results_async(snapshot_id) - def sellers_fetch(self, snapshot_id: str) -> Any: - """Fetch Amazon sellers scrape results (sync wrapper).""" - return asyncio.run(self.sellers_fetch_async(snapshot_id)) + + def sellers_fetch_sync(self, snapshot_id: str) -> Any: + """Fetch Amazon sellers scrape results (sync version).""" + return asyncio.run(self.sellers_fetch(snapshot_id)) # ============================================================================ # CORE SCRAPING LOGIC (Standard async workflow) diff --git a/src/brightdata/scrapers/amazon/search.py b/src/brightdata/scrapers/amazon/search.py index 802b7d9..dde58b4 100644 --- a/src/brightdata/scrapers/amazon/search.py +++ b/src/brightdata/scrapers/amazon/search.py @@ -2,8 +2,10 @@ Amazon Search Scraper - Discovery/parameter-based operations. Implements: -- client.search.amazon.products() - Find products by keyword/category/filters -- client.search.amazon.best_sellers() - Find best sellers by category +- client.search.amazon.products() - Find products by keyword/category/filters (async) +- client.search.amazon.products_sync() - Find products by keyword/category/filters (sync) + +Async methods are the default. Sync methods use asyncio.run() internally. 
""" import asyncio @@ -58,7 +60,7 @@ def __init__(self, bearer_token: str, engine: Optional[AsyncEngine] = None): # PRODUCTS SEARCH (by keyword + filters) # ============================================================================ - async def products_async( + async def products( self, keyword: Optional[Union[str, List[str]]] = None, url: Optional[Union[str, List[str]]] = None, @@ -71,7 +73,7 @@ async def products_async( timeout: int = DEFAULT_TIMEOUT_MEDIUM, ) -> ScrapeResult: """ - Search Amazon products by keyword and filters (async). + Search Amazon products by keyword and filters. Args: keyword: Search keyword(s) (e.g., "laptop", "wireless headphones") @@ -88,18 +90,13 @@ async def products_async( ScrapeResult with matching products Example: - >>> # Search by keyword - >>> result = await scraper.products_async( - ... keyword="laptop", - ... min_price=50000, # $500 in cents - ... max_price=200000, # $2000 in cents - ... prime_eligible=True - ... ) - >>> - >>> # Search by category URL - >>> result = await scraper.products_async( - ... url="https://www.amazon.com/s?k=laptop&i=electronics" - ... ) + >>> async with BrightDataClient() as client: + ... result = await client.search.amazon.products( + ... keyword="laptop", + ... min_price=50000, # $500 in cents + ... max_price=200000, # $2000 in cents + ... prime_eligible=True + ... ) """ # At least one search criteria required if not any([keyword, url, category]): @@ -167,7 +164,8 @@ async def products_async( timeout=timeout, ) - def products( + + def products_sync( self, keyword: Optional[Union[str, List[str]]] = None, url: Optional[Union[str, List[str]]] = None, @@ -180,22 +178,13 @@ def products( timeout: int = DEFAULT_TIMEOUT_MEDIUM, ) -> ScrapeResult: """ - Search Amazon products by keyword and filters (sync). + Search Amazon products by keyword and filters (sync version). - See products_async() for documentation. - - Example: - >>> result = scraper.products( - ... keyword="laptop", - ... min_price=50000, - ... max_price=200000, - ... prime_eligible=True - ... ) + See products() for full documentation. """ - async def _run(): async with self.engine: - return await self.products_async( + return await self.products( keyword=keyword, url=url, category=category, @@ -206,7 +195,6 @@ async def _run(): country=country, timeout=timeout, ) - return asyncio.run(_run()) # ============================================================================ diff --git a/src/brightdata/scrapers/base.py b/src/brightdata/scrapers/base.py index 277dd67..ece8f74 100644 --- a/src/brightdata/scrapers/base.py +++ b/src/brightdata/scrapers/base.py @@ -343,6 +343,28 @@ def _fetch_results(self, snapshot_id: str, format: str = "json") -> Any: """Fetch scrape job results (internal sync wrapper).""" return _run_blocking(self._fetch_results_async(snapshot_id, format=format)) + # ============================================================================ + # CONTEXT MANAGER SUPPORT (for standalone usage) + # ============================================================================ + + async def __aenter__(self): + """ + Async context manager entry for standalone scraper usage. + + When using a scraper directly (not through BrightDataClient), + use the context manager to ensure proper engine lifecycle management. + + Example: + >>> async with AmazonScraper(token="...") as scraper: + ... 
result = await scraper.products(url) + """ + await self.engine.__aenter__() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Async context manager exit - cleanup engine.""" + await self.engine.__aexit__(exc_type, exc_val, exc_tb) + def __repr__(self) -> str: """String representation for debugging.""" platform = self.PLATFORM_NAME or self.__class__.__name__ diff --git a/src/brightdata/scrapers/chatgpt/scraper.py b/src/brightdata/scrapers/chatgpt/scraper.py index d7ede3d..eb159e6 100644 --- a/src/brightdata/scrapers/chatgpt/scraper.py +++ b/src/brightdata/scrapers/chatgpt/scraper.py @@ -28,16 +28,25 @@ class ChatGPTScraper(BaseWebScraper): Supports prompts with optional web search and follow-up conversations. Methods: - prompt(): Single prompt interaction - prompts(): Batch prompt processing + prompt(): Single prompt interaction (async) + prompt_sync(): Single prompt interaction (sync) + prompts(): Batch prompt processing (async) + prompts_sync(): Batch prompt processing (sync) Example: >>> scraper = ChatGPTScraper(bearer_token="token") - >>> result = scraper.prompt( + >>> + >>> # Async + >>> result = await scraper.prompt( + ... prompt="Explain async programming in Python", + ... web_search=False + ... ) + >>> + >>> # Sync + >>> result = scraper.prompt_sync( ... prompt="Explain async programming in Python", ... web_search=False ... ) - >>> print(result.data) """ DATASET_ID = "gd_m7aof0k82r803d5bjm" # ChatGPT dataset @@ -49,7 +58,7 @@ class ChatGPTScraper(BaseWebScraper): # PROMPT METHODS # ============================================================================ - async def prompt_async( + async def prompt( self, prompt: str, country: str = "us", @@ -73,7 +82,7 @@ async def prompt_async( ScrapeResult with ChatGPT response Example: - >>> result = await scraper.prompt_async( + >>> result = await scraper.prompt( ... prompt="What are the latest trends in AI?", ... web_search=True ... ) @@ -111,19 +120,34 @@ async def prompt_async( return result - def prompt(self, prompt: str, **kwargs) -> ScrapeResult: + + def prompt_sync( + self, + prompt: str, + country: str = "us", + web_search: bool = False, + additional_prompt: Optional[str] = None, + poll_interval: int = DEFAULT_POLL_INTERVAL, + poll_timeout: Optional[int] = None, + ) -> ScrapeResult: """ Send prompt to ChatGPT (sync). - See prompt_async() for full documentation. + See prompt() for full documentation. 
Example: - >>> result = scraper.prompt("Explain Python asyncio") + >>> result = scraper.prompt_sync("Explain Python asyncio") """ - async def _run(): async with self.engine: - return await self.prompt_async(prompt, **kwargs) + return await self.prompt( + prompt=prompt, + country=country, + web_search=web_search, + additional_prompt=additional_prompt, + poll_interval=poll_interval, + poll_timeout=poll_timeout, + ) return asyncio.run(_run()) @@ -131,7 +155,7 @@ async def _run(): # PROMPT TRIGGER/STATUS/FETCH (Manual Control) # ============================================================================ - async def prompt_trigger_async( + async def prompt_trigger( self, prompt: str, country: str = "us", @@ -167,7 +191,8 @@ async def prompt_trigger_async( cost_per_record=self.COST_PER_RECORD, ) - def prompt_trigger( + + def prompt_trigger_sync( self, prompt: str, country: str = "us", @@ -175,27 +200,40 @@ def prompt_trigger( additional_prompt: Optional[str] = None, ) -> "ScrapeJob": """Trigger ChatGPT prompt (sync wrapper).""" - return asyncio.run( - self.prompt_trigger_async(prompt, country, web_search, additional_prompt) - ) + async def _run(): + async with self.engine: + return await self.prompt_trigger(prompt, country, web_search, additional_prompt) + return asyncio.run(_run()) - async def prompt_status_async(self, snapshot_id: str) -> str: + async def prompt_status(self, snapshot_id: str) -> str: """Check ChatGPT prompt status (async).""" return await self._check_status_async(snapshot_id) - def prompt_status(self, snapshot_id: str) -> str: + + def prompt_status_sync(self, snapshot_id: str) -> str: """Check ChatGPT prompt status (sync wrapper).""" - return asyncio.run(self.prompt_status_async(snapshot_id)) + async def _run(): + async with self.engine: + return await self.prompt_status(snapshot_id) + return asyncio.run(_run()) - async def prompt_fetch_async(self, snapshot_id: str) -> Any: + async def prompt_fetch(self, snapshot_id: str) -> Any: """Fetch ChatGPT prompt results (async).""" return await self._fetch_results_async(snapshot_id) - def prompt_fetch(self, snapshot_id: str) -> Any: + + def prompt_fetch_sync(self, snapshot_id: str) -> Any: """Fetch ChatGPT prompt results (sync wrapper).""" - return asyncio.run(self.prompt_fetch_async(snapshot_id)) + async def _run(): + async with self.engine: + return await self.prompt_fetch(snapshot_id) + return asyncio.run(_run()) - async def prompts_async( + # ============================================================================ + # BATCH PROMPTS METHODS + # ============================================================================ + + async def prompts( self, prompts: List[str], countries: Optional[List[str]] = None, @@ -219,7 +257,7 @@ async def prompts_async( ScrapeResult with list of ChatGPT responses Example: - >>> result = await scraper.prompts_async( + >>> result = await scraper.prompts( ... prompts=[ ... "Explain Python", ... "Explain JavaScript", @@ -262,16 +300,31 @@ async def prompts_async( return result - def prompts(self, prompts: List[str], **kwargs) -> ScrapeResult: + + def prompts_sync( + self, + prompts: List[str], + countries: Optional[List[str]] = None, + web_searches: Optional[List[bool]] = None, + additional_prompts: Optional[List[str]] = None, + poll_interval: int = DEFAULT_POLL_INTERVAL, + poll_timeout: Optional[int] = None, + ) -> ScrapeResult: """ Send multiple prompts (sync). - See prompts_async() for full documentation. + See prompts() for full documentation. 
""" - async def _run(): async with self.engine: - return await self.prompts_async(prompts, **kwargs) + return await self.prompts( + prompts=prompts, + countries=countries, + web_searches=web_searches, + additional_prompts=additional_prompts, + poll_interval=poll_interval, + poll_timeout=poll_timeout, + ) return asyncio.run(_run()) @@ -279,7 +332,7 @@ async def _run(): # PROMPTS TRIGGER/STATUS/FETCH (Manual Control for batch) # ============================================================================ - async def prompts_trigger_async( + async def prompts_trigger( self, prompts: List[str], countries: Optional[List[str]] = None, @@ -315,7 +368,8 @@ async def prompts_trigger_async( cost_per_record=self.COST_PER_RECORD, ) - def prompts_trigger( + + def prompts_trigger_sync( self, prompts: List[str], countries: Optional[List[str]] = None, @@ -323,31 +377,40 @@ def prompts_trigger( additional_prompts: Optional[List[str]] = None, ) -> "ScrapeJob": """Trigger ChatGPT batch prompts (sync wrapper).""" - return asyncio.run( - self.prompts_trigger_async(prompts, countries, web_searches, additional_prompts) - ) + async def _run(): + async with self.engine: + return await self.prompts_trigger(prompts, countries, web_searches, additional_prompts) + return asyncio.run(_run()) - async def prompts_status_async(self, snapshot_id: str) -> str: + async def prompts_status(self, snapshot_id: str) -> str: """Check ChatGPT batch prompts status (async).""" return await self._check_status_async(snapshot_id) - def prompts_status(self, snapshot_id: str) -> str: + + def prompts_status_sync(self, snapshot_id: str) -> str: """Check ChatGPT batch prompts status (sync wrapper).""" - return asyncio.run(self.prompts_status_async(snapshot_id)) + async def _run(): + async with self.engine: + return await self.prompts_status(snapshot_id) + return asyncio.run(_run()) - async def prompts_fetch_async(self, snapshot_id: str) -> Any: + async def prompts_fetch(self, snapshot_id: str) -> Any: """Fetch ChatGPT batch prompts results (async).""" return await self._fetch_results_async(snapshot_id) - def prompts_fetch(self, snapshot_id: str) -> Any: + + def prompts_fetch_sync(self, snapshot_id: str) -> Any: """Fetch ChatGPT batch prompts results (sync wrapper).""" - return asyncio.run(self.prompts_fetch_async(snapshot_id)) + async def _run(): + async with self.engine: + return await self.prompts_fetch(snapshot_id) + return asyncio.run(_run()) # ============================================================================ # SCRAPE OVERRIDE (ChatGPT doesn't use URL-based scraping) # ============================================================================ - async def scrape_async( + async def scrape( self, urls: Union[str, List[str]], **kwargs ) -> Union[ScrapeResult, List[ScrapeResult]]: """ @@ -360,7 +423,8 @@ async def scrape_async( "Use prompt() or prompts() methods instead." ) - def scrape(self, urls: Union[str, List[str]], **kwargs): + + def scrape_sync(self, urls: Union[str, List[str]], **kwargs): """ChatGPT doesn't support URL-based scraping.""" raise NotImplementedError( "ChatGPT scraper doesn't support URL-based scraping. " diff --git a/src/brightdata/scrapers/chatgpt/search.py b/src/brightdata/scrapers/chatgpt/search.py index 30cf123..a3b5914 100644 --- a/src/brightdata/scrapers/chatgpt/search.py +++ b/src/brightdata/scrapers/chatgpt/search.py @@ -2,7 +2,8 @@ ChatGPT Search Service - Prompt-based discovery. 
API Specification: -- client.search.chatGPT(prompt, country, secondaryPrompt, webSearch, timeout) +- client.search.chatGPT(prompt, country, secondaryPrompt, webSearch, timeout) - async +- client.search.chatGPT_sync(prompt, country, secondaryPrompt, webSearch, timeout) - sync All parameters accept str | array or bool | array Uses standard async workflow (trigger/poll/fetch). @@ -29,7 +30,17 @@ class ChatGPTSearchService: Example: >>> search = ChatGPTSearchService(bearer_token="token") - >>> result = search.chatGPT( + >>> + >>> # Async + >>> result = await search.chatGPT( + ... prompt="Explain Python async programming", + ... country="us", + ... webSearch=True, + ... timeout=180 + ... ) + >>> + >>> # Sync + >>> result = search.chatGPT_sync( ... prompt="Explain Python async programming", ... country="us", ... webSearch=True, @@ -61,7 +72,7 @@ def __init__(self, bearer_token: str, engine: Optional[AsyncEngine] = None): # CHATGPT PROMPT DISCOVERY # ============================================================================ - async def chatGPT_async( + async def chatGPT( self, prompt: Union[str, List[str]], country: Optional[Union[str, List[str]]] = None, @@ -85,7 +96,7 @@ async def chatGPT_async( ScrapeResult with ChatGPT response(s) Example: - >>> result = await search.chatGPT_async( + >>> result = await search.chatGPT( ... prompt="What is Python?", ... country="us", ... webSearch=True, @@ -93,7 +104,7 @@ async def chatGPT_async( ... ) >>> >>> # Batch prompts - >>> result = await search.chatGPT_async( + >>> result = await search.chatGPT( ... prompt=["What is Python?", "What is JavaScript?"], ... country=["us", "us"], ... webSearch=[False, False] @@ -139,7 +150,8 @@ async def chatGPT_async( return result - def chatGPT( + + def chatGPT_sync( self, prompt: Union[str, List[str]], country: Optional[Union[str, List[str]]] = None, @@ -150,23 +162,24 @@ def chatGPT( """ Send prompt(s) to ChatGPT (sync wrapper). - See chatGPT_async() for full documentation. + See chatGPT() for full documentation. Example: - >>> result = search.chatGPT( + >>> result = search.chatGPT_sync( ... prompt="Explain async programming", ... webSearch=True ... ) """ - return asyncio.run( - self.chatGPT_async( - prompt=prompt, - country=country, - secondaryPrompt=secondaryPrompt, - webSearch=webSearch, - timeout=timeout, - ) - ) + async def _run(): + async with self.engine: + return await self.chatGPT( + prompt=prompt, + country=country, + secondaryPrompt=secondaryPrompt, + webSearch=webSearch, + timeout=timeout, + ) + return asyncio.run(_run()) # ============================================================================ # HELPER METHODS diff --git a/src/brightdata/scrapers/facebook/scraper.py b/src/brightdata/scrapers/facebook/scraper.py index 54b0577..a025bd7 100644 --- a/src/brightdata/scrapers/facebook/scraper.py +++ b/src/brightdata/scrapers/facebook/scraper.py @@ -43,8 +43,15 @@ class FacebookScraper(BaseWebScraper): Example: >>> scraper = FacebookScraper(bearer_token="token") >>> - >>> # Scrape posts from profile - >>> result = scraper.posts_by_profile( + >>> # Async usage + >>> result = await scraper.posts_by_profile( + ... url="https://facebook.com/profile", + ... num_of_posts=10, + ... timeout=240 + ... ) + >>> + >>> # Sync usage + >>> result = scraper.posts_by_profile_sync( ... url="https://facebook.com/profile", ... num_of_posts=10, ... 
timeout=240 @@ -67,7 +74,7 @@ class FacebookScraper(BaseWebScraper): # POSTS API - By Profile URL # ============================================================================ - async def posts_by_profile_async( + async def posts_by_profile( self, url: Union[str, List[str]], num_of_posts: Optional[int] = None, @@ -94,7 +101,7 @@ async def posts_by_profile_async( ScrapeResult or List[ScrapeResult] with post data Example: - >>> result = await scraper.posts_by_profile_async( + >>> result = await scraper.posts_by_profile( ... url="https://facebook.com/profile", ... num_of_posts=10, ... start_date="01-01-2025", @@ -118,7 +125,7 @@ async def posts_by_profile_async( sdk_function="posts_by_profile", ) - def posts_by_profile( + def posts_by_profile_sync( self, url: Union[str, List[str]], num_of_posts: Optional[int] = None, @@ -131,7 +138,7 @@ def posts_by_profile( async def _run(): async with self.engine: - return await self.posts_by_profile_async( + return await self.posts_by_profile( url, num_of_posts, posts_to_not_include, start_date, end_date, timeout ) @@ -139,7 +146,7 @@ async def _run(): # --- Trigger Interface (Manual Control) --- - async def posts_by_profile_trigger_async( + async def posts_by_profile_trigger( self, url: Union[str, List[str]], num_of_posts: Optional[int] = None, @@ -177,31 +184,31 @@ async def posts_by_profile_trigger_async( cost_per_record=self.COST_PER_RECORD, ) - def posts_by_profile_trigger(self, url: Union[str, List[str]], **kwargs) -> "ScrapeJob": + def posts_by_profile_trigger_sync(self, url: Union[str, List[str]], **kwargs) -> "ScrapeJob": """Trigger Facebook posts by profile scrape (sync wrapper).""" - return asyncio.run(self.posts_by_profile_trigger_async(url, **kwargs)) + return asyncio.run(self.posts_by_profile_trigger(url, **kwargs)) - async def posts_by_profile_status_async(self, snapshot_id: str) -> str: + async def posts_by_profile_status(self, snapshot_id: str) -> str: """Check Facebook posts by profile status (async).""" return await self._check_status_async(snapshot_id) - def posts_by_profile_status(self, snapshot_id: str) -> str: + def posts_by_profile_status_sync(self, snapshot_id: str) -> str: """Check Facebook posts by profile status (sync wrapper).""" - return asyncio.run(self.posts_by_profile_status_async(snapshot_id)) + return asyncio.run(self.posts_by_profile_status(snapshot_id)) - async def posts_by_profile_fetch_async(self, snapshot_id: str) -> Any: + async def posts_by_profile_fetch(self, snapshot_id: str) -> Any: """Fetch Facebook posts by profile results (async).""" return await self._fetch_results_async(snapshot_id) - def posts_by_profile_fetch(self, snapshot_id: str) -> Any: + def posts_by_profile_fetch_sync(self, snapshot_id: str) -> Any: """Fetch Facebook posts by profile results (sync wrapper).""" - return asyncio.run(self.posts_by_profile_fetch_async(snapshot_id)) + return asyncio.run(self.posts_by_profile_fetch(snapshot_id)) # ============================================================================ # POSTS API - By Group URL # ============================================================================ - async def posts_by_group_async( + async def posts_by_group( self, url: Union[str, List[str]], num_of_posts: Optional[int] = None, @@ -228,7 +235,7 @@ async def posts_by_group_async( ScrapeResult or List[ScrapeResult] with post data Example: - >>> result = await scraper.posts_by_group_async( + >>> result = await scraper.posts_by_group( ... url="https://facebook.com/groups/example", ... num_of_posts=20, ... 
timeout=240 @@ -250,7 +257,7 @@ async def posts_by_group_async( sdk_function="posts_by_group", ) - def posts_by_group( + def posts_by_group_sync( self, url: Union[str, List[str]], num_of_posts: Optional[int] = None, @@ -263,7 +270,7 @@ def posts_by_group( async def _run(): async with self.engine: - return await self.posts_by_group_async( + return await self.posts_by_group( url, num_of_posts, posts_to_not_include, start_date, end_date, timeout ) @@ -271,7 +278,7 @@ async def _run(): # --- Trigger Interface (Manual Control) --- - async def posts_by_group_trigger_async( + async def posts_by_group_trigger( self, url: Union[str, List[str]], **kwargs ) -> "ScrapeJob": """Trigger Facebook posts by group scrape (async - manual control).""" @@ -292,31 +299,31 @@ async def posts_by_group_trigger_async( cost_per_record=self.COST_PER_RECORD, ) - def posts_by_group_trigger(self, url: Union[str, List[str]], **kwargs) -> "ScrapeJob": + def posts_by_group_trigger_sync(self, url: Union[str, List[str]], **kwargs) -> "ScrapeJob": """Trigger Facebook posts by group scrape (sync wrapper).""" - return asyncio.run(self.posts_by_group_trigger_async(url, **kwargs)) + return asyncio.run(self.posts_by_group_trigger(url, **kwargs)) - async def posts_by_group_status_async(self, snapshot_id: str) -> str: + async def posts_by_group_status(self, snapshot_id: str) -> str: """Check Facebook posts by group status (async).""" return await self._check_status_async(snapshot_id) - def posts_by_group_status(self, snapshot_id: str) -> str: + def posts_by_group_status_sync(self, snapshot_id: str) -> str: """Check Facebook posts by group status (sync wrapper).""" - return asyncio.run(self.posts_by_group_status_async(snapshot_id)) + return asyncio.run(self.posts_by_group_status(snapshot_id)) - async def posts_by_group_fetch_async(self, snapshot_id: str) -> Any: + async def posts_by_group_fetch(self, snapshot_id: str) -> Any: """Fetch Facebook posts by group results (async).""" return await self._fetch_results_async(snapshot_id) - def posts_by_group_fetch(self, snapshot_id: str) -> Any: + def posts_by_group_fetch_sync(self, snapshot_id: str) -> Any: """Fetch Facebook posts by group results (sync wrapper).""" - return asyncio.run(self.posts_by_group_fetch_async(snapshot_id)) + return asyncio.run(self.posts_by_group_fetch(snapshot_id)) # ============================================================================ # POSTS API - By Post URL # ============================================================================ - async def posts_by_url_async( + async def posts_by_url( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_MEDIUM, @@ -335,7 +342,7 @@ async def posts_by_url_async( ScrapeResult or List[ScrapeResult] with post data Example: - >>> result = await scraper.posts_by_url_async( + >>> result = await scraper.posts_by_url( ... url="https://facebook.com/post/123456", ... timeout=240 ... 
) @@ -352,7 +359,7 @@ async def posts_by_url_async( sdk_function="posts_by_url", ) - def posts_by_url( + def posts_by_url_sync( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_MEDIUM, @@ -361,13 +368,13 @@ def posts_by_url( async def _run(): async with self.engine: - return await self.posts_by_url_async(url, timeout) + return await self.posts_by_url(url, timeout) return asyncio.run(_run()) # --- Trigger Interface (Manual Control) --- - async def posts_by_url_trigger_async(self, url: Union[str, List[str]]) -> "ScrapeJob": + async def posts_by_url_trigger(self, url: Union[str, List[str]]) -> "ScrapeJob": """Trigger Facebook posts by URL scrape (async - manual control).""" sdk_function = get_caller_function_name() @@ -377,31 +384,31 @@ async def posts_by_url_trigger_async(self, url: Union[str, List[str]]) -> "Scrap sdk_function=sdk_function or "posts_by_url_trigger", ) - def posts_by_url_trigger(self, url: Union[str, List[str]]) -> "ScrapeJob": + def posts_by_url_trigger_sync(self, url: Union[str, List[str]]) -> "ScrapeJob": """Trigger Facebook posts by URL scrape (sync wrapper).""" - return asyncio.run(self.posts_by_url_trigger_async(url)) + return asyncio.run(self.posts_by_url_trigger(url)) - async def posts_by_url_status_async(self, snapshot_id: str) -> str: + async def posts_by_url_status(self, snapshot_id: str) -> str: """Check Facebook posts by URL status (async).""" return await self._check_status_async(snapshot_id) - def posts_by_url_status(self, snapshot_id: str) -> str: + def posts_by_url_status_sync(self, snapshot_id: str) -> str: """Check Facebook posts by URL status (sync wrapper).""" - return asyncio.run(self.posts_by_url_status_async(snapshot_id)) + return asyncio.run(self.posts_by_url_status(snapshot_id)) - async def posts_by_url_fetch_async(self, snapshot_id: str) -> Any: + async def posts_by_url_fetch(self, snapshot_id: str) -> Any: """Fetch Facebook posts by URL results (async).""" return await self._fetch_results_async(snapshot_id) - def posts_by_url_fetch(self, snapshot_id: str) -> Any: + def posts_by_url_fetch_sync(self, snapshot_id: str) -> Any: """Fetch Facebook posts by URL results (sync wrapper).""" - return asyncio.run(self.posts_by_url_fetch_async(snapshot_id)) + return asyncio.run(self.posts_by_url_fetch(snapshot_id)) # ============================================================================ # COMMENTS API - By Post URL # ============================================================================ - async def comments_async( + async def comments( self, url: Union[str, List[str]], num_of_comments: Optional[int] = None, @@ -428,7 +435,7 @@ async def comments_async( ScrapeResult or List[ScrapeResult] with comment data Example: - >>> result = await scraper.comments_async( + >>> result = await scraper.comments( ... url="https://facebook.com/post/123456", ... num_of_comments=100, ... 
start_date="01-01-2025", @@ -452,7 +459,7 @@ async def comments_async( sdk_function="comments", ) - def comments( + def comments_sync( self, url: Union[str, List[str]], num_of_comments: Optional[int] = None, @@ -465,7 +472,7 @@ def comments( async def _run(): async with self.engine: - return await self.comments_async( + return await self.comments( url, num_of_comments, comments_to_not_include, start_date, end_date, timeout ) @@ -473,7 +480,7 @@ async def _run(): # --- Trigger Interface (Manual Control) --- - async def comments_trigger_async(self, url: Union[str, List[str]], **kwargs) -> "ScrapeJob": + async def comments_trigger(self, url: Union[str, List[str]], **kwargs) -> "ScrapeJob": """Trigger Facebook comments scrape (async - manual control).""" from ..job import ScrapeJob @@ -492,31 +499,31 @@ async def comments_trigger_async(self, url: Union[str, List[str]], **kwargs) -> cost_per_record=self.COST_PER_RECORD, ) - def comments_trigger(self, url: Union[str, List[str]], **kwargs) -> "ScrapeJob": + def comments_trigger_sync(self, url: Union[str, List[str]], **kwargs) -> "ScrapeJob": """Trigger Facebook comments scrape (sync wrapper).""" - return asyncio.run(self.comments_trigger_async(url, **kwargs)) + return asyncio.run(self.comments_trigger(url, **kwargs)) - async def comments_status_async(self, snapshot_id: str) -> str: + async def comments_status(self, snapshot_id: str) -> str: """Check Facebook comments status (async).""" return await self._check_status_async(snapshot_id) - def comments_status(self, snapshot_id: str) -> str: + def comments_status_sync(self, snapshot_id: str) -> str: """Check Facebook comments status (sync wrapper).""" - return asyncio.run(self.comments_status_async(snapshot_id)) + return asyncio.run(self.comments_status(snapshot_id)) - async def comments_fetch_async(self, snapshot_id: str) -> Any: + async def comments_fetch(self, snapshot_id: str) -> Any: """Fetch Facebook comments results (async).""" return await self._fetch_results_async(snapshot_id) - def comments_fetch(self, snapshot_id: str) -> Any: + def comments_fetch_sync(self, snapshot_id: str) -> Any: """Fetch Facebook comments results (sync wrapper).""" - return asyncio.run(self.comments_fetch_async(snapshot_id)) + return asyncio.run(self.comments_fetch(snapshot_id)) # ============================================================================ # REELS API - By Profile URL # ============================================================================ - async def reels_async( + async def reels( self, url: Union[str, List[str]], num_of_posts: Optional[int] = None, @@ -543,7 +550,7 @@ async def reels_async( ScrapeResult or List[ScrapeResult] with reel data Example: - >>> result = await scraper.reels_async( + >>> result = await scraper.reels( ... url="https://facebook.com/profile", ... num_of_posts=50, ... 
timeout=240 @@ -565,7 +572,7 @@ async def reels_async( sdk_function="reels", ) - def reels( + def reels_sync( self, url: Union[str, List[str]], num_of_posts: Optional[int] = None, @@ -578,7 +585,7 @@ def reels( async def _run(): async with self.engine: - return await self.reels_async( + return await self.reels( url, num_of_posts, posts_to_not_include, start_date, end_date, timeout ) @@ -586,7 +593,7 @@ async def _run(): # --- Trigger Interface (Manual Control) --- - async def reels_trigger_async(self, url: Union[str, List[str]], **kwargs) -> "ScrapeJob": + async def reels_trigger(self, url: Union[str, List[str]], **kwargs) -> "ScrapeJob": """Trigger Facebook reels scrape (async - manual control).""" from ..job import ScrapeJob @@ -605,25 +612,25 @@ async def reels_trigger_async(self, url: Union[str, List[str]], **kwargs) -> "Sc cost_per_record=self.COST_PER_RECORD, ) - def reels_trigger(self, url: Union[str, List[str]], **kwargs) -> "ScrapeJob": + def reels_trigger_sync(self, url: Union[str, List[str]], **kwargs) -> "ScrapeJob": """Trigger Facebook reels scrape (sync wrapper).""" - return asyncio.run(self.reels_trigger_async(url, **kwargs)) + return asyncio.run(self.reels_trigger(url, **kwargs)) - async def reels_status_async(self, snapshot_id: str) -> str: + async def reels_status(self, snapshot_id: str) -> str: """Check Facebook reels status (async).""" return await self._check_status_async(snapshot_id) - def reels_status(self, snapshot_id: str) -> str: + def reels_status_sync(self, snapshot_id: str) -> str: """Check Facebook reels status (sync wrapper).""" - return asyncio.run(self.reels_status_async(snapshot_id)) + return asyncio.run(self.reels_status(snapshot_id)) - async def reels_fetch_async(self, snapshot_id: str) -> Any: + async def reels_fetch(self, snapshot_id: str) -> Any: """Fetch Facebook reels results (async).""" return await self._fetch_results_async(snapshot_id) - def reels_fetch(self, snapshot_id: str) -> Any: + def reels_fetch_sync(self, snapshot_id: str) -> Any: """Fetch Facebook reels results (sync wrapper).""" - return asyncio.run(self.reels_fetch_async(snapshot_id)) + return asyncio.run(self.reels_fetch(snapshot_id)) # ============================================================================ # CORE SCRAPING LOGIC @@ -752,7 +759,7 @@ async def _scrape_with_params( poll_timeout=timeout, include_errors=True, normalize_func=self.normalize_result, - sdk_function="posts_by_profile", + sdk_function=sdk_function or "posts_by_profile", ) if is_single and isinstance(result.data, list) and len(result.data) == 1: diff --git a/src/brightdata/scrapers/instagram/scraper.py b/src/brightdata/scrapers/instagram/scraper.py index 27f699b..c27374c 100644 --- a/src/brightdata/scrapers/instagram/scraper.py +++ b/src/brightdata/scrapers/instagram/scraper.py @@ -44,8 +44,14 @@ class InstagramScraper(BaseWebScraper): Example: >>> scraper = InstagramScraper(bearer_token="token") >>> - >>> # Scrape profile - >>> result = scraper.profiles( + >>> # Async usage + >>> result = await scraper.profiles( + ... url="https://instagram.com/username", + ... timeout=240 + ... ) + >>> + >>> # Sync usage + >>> result = scraper.profiles_sync( ... url="https://instagram.com/username", ... timeout=240 ... 
) @@ -66,7 +72,7 @@ class InstagramScraper(BaseWebScraper): # PROFILES API - By URL # ============================================================================ - async def profiles_async( + async def profiles( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_MEDIUM, @@ -85,7 +91,7 @@ async def profiles_async( ScrapeResult or List[ScrapeResult] with profile data Example: - >>> result = await scraper.profiles_async( + >>> result = await scraper.profiles( ... url="https://instagram.com/username", ... timeout=240 ... ) @@ -102,7 +108,7 @@ async def profiles_async( sdk_function="profiles", ) - def profiles( + def profiles_sync( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_MEDIUM, @@ -111,13 +117,13 @@ def profiles( async def _run(): async with self.engine: - return await self.profiles_async(url, timeout) + return await self.profiles(url, timeout) return asyncio.run(_run()) # --- Trigger Interface (Manual Control) --- - async def profiles_trigger_async(self, url: Union[str, List[str]]) -> "ScrapeJob": + async def profiles_trigger(self, url: Union[str, List[str]]) -> "ScrapeJob": """Trigger Instagram profiles scrape (async - manual control).""" sdk_function = get_caller_function_name() @@ -127,31 +133,31 @@ async def profiles_trigger_async(self, url: Union[str, List[str]]) -> "ScrapeJob sdk_function=sdk_function or "profiles_trigger", ) - def profiles_trigger(self, url: Union[str, List[str]]) -> "ScrapeJob": + def profiles_trigger_sync(self, url: Union[str, List[str]]) -> "ScrapeJob": """Trigger Instagram profiles scrape (sync wrapper).""" - return asyncio.run(self.profiles_trigger_async(url)) + return asyncio.run(self.profiles_trigger(url)) - async def profiles_status_async(self, snapshot_id: str) -> str: + async def profiles_status(self, snapshot_id: str) -> str: """Check Instagram profiles status (async).""" return await self._check_status_async(snapshot_id) - def profiles_status(self, snapshot_id: str) -> str: + def profiles_status_sync(self, snapshot_id: str) -> str: """Check Instagram profiles status (sync wrapper).""" - return asyncio.run(self.profiles_status_async(snapshot_id)) + return asyncio.run(self.profiles_status(snapshot_id)) - async def profiles_fetch_async(self, snapshot_id: str) -> Any: + async def profiles_fetch(self, snapshot_id: str) -> Any: """Fetch Instagram profiles results (async).""" return await self._fetch_results_async(snapshot_id) - def profiles_fetch(self, snapshot_id: str) -> Any: + def profiles_fetch_sync(self, snapshot_id: str) -> Any: """Fetch Instagram profiles results (sync wrapper).""" - return asyncio.run(self.profiles_fetch_async(snapshot_id)) + return asyncio.run(self.profiles_fetch(snapshot_id)) # ============================================================================ # POSTS API - By URL # ============================================================================ - async def posts_async( + async def posts( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_MEDIUM, @@ -170,7 +176,7 @@ async def posts_async( ScrapeResult or List[ScrapeResult] with post data Example: - >>> result = await scraper.posts_async( + >>> result = await scraper.posts( ... url="https://instagram.com/p/ABC123", ... timeout=240 ... 
) @@ -187,7 +193,7 @@ async def posts_async( sdk_function="posts", ) - def posts( + def posts_sync( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_MEDIUM, @@ -196,13 +202,13 @@ def posts( async def _run(): async with self.engine: - return await self.posts_async(url, timeout) + return await self.posts(url, timeout) return asyncio.run(_run()) # --- Trigger Interface (Manual Control) --- - async def posts_trigger_async(self, url: Union[str, List[str]]) -> "ScrapeJob": + async def posts_trigger(self, url: Union[str, List[str]]) -> "ScrapeJob": """Trigger Instagram posts scrape (async - manual control).""" sdk_function = get_caller_function_name() @@ -210,31 +216,31 @@ async def posts_trigger_async(self, url: Union[str, List[str]]) -> "ScrapeJob": urls=url, dataset_id=self.DATASET_ID_POSTS, sdk_function=sdk_function or "posts_trigger" ) - def posts_trigger(self, url: Union[str, List[str]]) -> "ScrapeJob": + def posts_trigger_sync(self, url: Union[str, List[str]]) -> "ScrapeJob": """Trigger Instagram posts scrape (sync wrapper).""" - return asyncio.run(self.posts_trigger_async(url)) + return asyncio.run(self.posts_trigger(url)) - async def posts_status_async(self, snapshot_id: str) -> str: + async def posts_status(self, snapshot_id: str) -> str: """Check Instagram posts status (async).""" return await self._check_status_async(snapshot_id) - def posts_status(self, snapshot_id: str) -> str: + def posts_status_sync(self, snapshot_id: str) -> str: """Check Instagram posts status (sync wrapper).""" - return asyncio.run(self.posts_status_async(snapshot_id)) + return asyncio.run(self.posts_status(snapshot_id)) - async def posts_fetch_async(self, snapshot_id: str) -> Any: + async def posts_fetch(self, snapshot_id: str) -> Any: """Fetch Instagram posts results (async).""" return await self._fetch_results_async(snapshot_id) - def posts_fetch(self, snapshot_id: str) -> Any: + def posts_fetch_sync(self, snapshot_id: str) -> Any: """Fetch Instagram posts results (sync wrapper).""" - return asyncio.run(self.posts_fetch_async(snapshot_id)) + return asyncio.run(self.posts_fetch(snapshot_id)) # ============================================================================ # COMMENTS API - By Post URL # ============================================================================ - async def comments_async( + async def comments( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_MEDIUM, @@ -253,7 +259,7 @@ async def comments_async( ScrapeResult or List[ScrapeResult] with comment data Example: - >>> result = await scraper.comments_async( + >>> result = await scraper.comments( ... url="https://instagram.com/p/ABC123", ... timeout=240 ... 
) @@ -270,7 +276,7 @@ async def comments_async( sdk_function="comments", ) - def comments( + def comments_sync( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_MEDIUM, @@ -279,13 +285,13 @@ def comments( async def _run(): async with self.engine: - return await self.comments_async(url, timeout) + return await self.comments(url, timeout) return asyncio.run(_run()) # --- Trigger Interface (Manual Control) --- - async def comments_trigger_async(self, url: Union[str, List[str]]) -> "ScrapeJob": + async def comments_trigger(self, url: Union[str, List[str]]) -> "ScrapeJob": """Trigger Instagram comments scrape (async - manual control).""" sdk_function = get_caller_function_name() @@ -295,31 +301,31 @@ async def comments_trigger_async(self, url: Union[str, List[str]]) -> "ScrapeJob sdk_function=sdk_function or "comments_trigger", ) - def comments_trigger(self, url: Union[str, List[str]]) -> "ScrapeJob": + def comments_trigger_sync(self, url: Union[str, List[str]]) -> "ScrapeJob": """Trigger Instagram comments scrape (sync wrapper).""" - return asyncio.run(self.comments_trigger_async(url)) + return asyncio.run(self.comments_trigger(url)) - async def comments_status_async(self, snapshot_id: str) -> str: + async def comments_status(self, snapshot_id: str) -> str: """Check Instagram comments status (async).""" return await self._check_status_async(snapshot_id) - def comments_status(self, snapshot_id: str) -> str: + def comments_status_sync(self, snapshot_id: str) -> str: """Check Instagram comments status (sync wrapper).""" - return asyncio.run(self.comments_status_async(snapshot_id)) + return asyncio.run(self.comments_status(snapshot_id)) - async def comments_fetch_async(self, snapshot_id: str) -> Any: + async def comments_fetch(self, snapshot_id: str) -> Any: """Fetch Instagram comments results (async).""" return await self._fetch_results_async(snapshot_id) - def comments_fetch(self, snapshot_id: str) -> Any: + def comments_fetch_sync(self, snapshot_id: str) -> Any: """Fetch Instagram comments results (sync wrapper).""" - return asyncio.run(self.comments_fetch_async(snapshot_id)) + return asyncio.run(self.comments_fetch(snapshot_id)) # ============================================================================ # REELS API - By URL # ============================================================================ - async def reels_async( + async def reels( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_MEDIUM, @@ -338,7 +344,7 @@ async def reels_async( ScrapeResult or List[ScrapeResult] with reel data Example: - >>> result = await scraper.reels_async( + >>> result = await scraper.reels( ... url="https://instagram.com/reel/ABC123", ... timeout=240 ... 
) @@ -355,7 +361,7 @@ async def reels_async( sdk_function="reels", ) - def reels( + def reels_sync( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_MEDIUM, @@ -364,13 +370,13 @@ def reels( async def _run(): async with self.engine: - return await self.reels_async(url, timeout) + return await self.reels(url, timeout) return asyncio.run(_run()) # --- Trigger Interface (Manual Control) --- - async def reels_trigger_async(self, url: Union[str, List[str]]) -> "ScrapeJob": + async def reels_trigger(self, url: Union[str, List[str]]) -> "ScrapeJob": """Trigger Instagram reels scrape (async - manual control).""" sdk_function = get_caller_function_name() @@ -378,25 +384,25 @@ async def reels_trigger_async(self, url: Union[str, List[str]]) -> "ScrapeJob": urls=url, dataset_id=self.DATASET_ID_REELS, sdk_function=sdk_function or "reels_trigger" ) - def reels_trigger(self, url: Union[str, List[str]]) -> "ScrapeJob": + def reels_trigger_sync(self, url: Union[str, List[str]]) -> "ScrapeJob": """Trigger Instagram reels scrape (sync wrapper).""" - return asyncio.run(self.reels_trigger_async(url)) + return asyncio.run(self.reels_trigger(url)) - async def reels_status_async(self, snapshot_id: str) -> str: + async def reels_status(self, snapshot_id: str) -> str: """Check Instagram reels status (async).""" return await self._check_status_async(snapshot_id) - def reels_status(self, snapshot_id: str) -> str: + def reels_status_sync(self, snapshot_id: str) -> str: """Check Instagram reels status (sync wrapper).""" - return asyncio.run(self.reels_status_async(snapshot_id)) + return asyncio.run(self.reels_status(snapshot_id)) - async def reels_fetch_async(self, snapshot_id: str) -> Any: + async def reels_fetch(self, snapshot_id: str) -> Any: """Fetch Instagram reels results (async).""" return await self._fetch_results_async(snapshot_id) - def reels_fetch(self, snapshot_id: str) -> Any: + def reels_fetch_sync(self, snapshot_id: str) -> Any: """Fetch Instagram reels results (sync wrapper).""" - return asyncio.run(self.reels_fetch_async(snapshot_id)) + return asyncio.run(self.reels_fetch(snapshot_id)) # ============================================================================ # CORE SCRAPING LOGIC diff --git a/src/brightdata/scrapers/instagram/search.py b/src/brightdata/scrapers/instagram/search.py index 4381769..43b347f 100644 --- a/src/brightdata/scrapers/instagram/search.py +++ b/src/brightdata/scrapers/instagram/search.py @@ -29,7 +29,16 @@ class InstagramSearchScraper: Example: >>> scraper = InstagramSearchScraper(bearer_token="token") - >>> result = scraper.posts( + >>> + >>> # Async usage + >>> result = await scraper.posts( + ... url="https://instagram.com/username", + ... num_of_posts=10, + ... post_type="reel" + ... ) + >>> + >>> # Sync usage + >>> result = scraper.posts_sync( ... url="https://instagram.com/username", ... num_of_posts=10, ... post_type="reel" @@ -62,7 +71,7 @@ def __init__(self, bearer_token: str, engine: Optional[AsyncEngine] = None): # POSTS DISCOVERY (by profile URL with filters) # ============================================================================ - async def posts_async( + async def posts( self, url: Union[str, List[str]], num_of_posts: Optional[int] = None, @@ -91,7 +100,7 @@ async def posts_async( ScrapeResult or List[ScrapeResult] with discovered posts Example: - >>> result = await scraper.posts_async( + >>> result = await scraper.posts( ... url="https://instagram.com/username", ... num_of_posts=10, ... 
start_date="01-01-2025", @@ -115,7 +124,7 @@ async def posts_async( timeout=timeout, ) - def posts( + def posts_sync( self, url: Union[str, List[str]], num_of_posts: Optional[int] = None, @@ -129,7 +138,7 @@ def posts( async def _run(): async with self.engine: - return await self.posts_async( + return await self.posts( url, num_of_posts, posts_to_not_include, @@ -145,7 +154,7 @@ async def _run(): # REELS DISCOVERY (by profile or search URL with filters) # ============================================================================ - async def reels_async( + async def reels( self, url: Union[str, List[str]], num_of_posts: Optional[int] = None, @@ -172,7 +181,7 @@ async def reels_async( ScrapeResult or List[ScrapeResult] with discovered reels Example: - >>> result = await scraper.reels_async( + >>> result = await scraper.reels( ... url="https://instagram.com/username", ... num_of_posts=50, ... start_date="01-01-2025", @@ -196,7 +205,7 @@ async def reels_async( sdk_function="reels", ) - def reels( + def reels_sync( self, url: Union[str, List[str]], num_of_posts: Optional[int] = None, @@ -209,7 +218,7 @@ def reels( async def _run(): async with self.engine: - return await self.reels_async( + return await self.reels( url, num_of_posts, posts_to_not_include, start_date, end_date, timeout ) diff --git a/src/brightdata/scrapers/job.py b/src/brightdata/scrapers/job.py index 4f36e00..bf02391 100644 --- a/src/brightdata/scrapers/job.py +++ b/src/brightdata/scrapers/job.py @@ -3,6 +3,8 @@ Provides convenient methods for checking status and fetching results after triggering a scrape operation. + +All methods are async-only. For sync usage, use SyncBrightDataClient. """ import asyncio @@ -25,19 +27,19 @@ class ScrapeJob: Example: >>> # Trigger and get job - >>> job = await client.scrape.amazon.products_trigger_async(url) + >>> job = await client.scrape.amazon.products_trigger(url) >>> >>> # Check status - >>> status = await job.status_async() + >>> status = await job.status() >>> >>> # Wait for completion - >>> await job.wait_async(timeout=120) + >>> await job.wait(timeout=120) >>> >>> # Fetch results - >>> data = await job.fetch_async() + >>> data = await job.fetch() >>> >>> # Or get as ScrapeResult - >>> result = await job.to_result_async() + >>> result = await job.to_result() """ def __init__( @@ -75,9 +77,9 @@ def __repr__(self) -> str: # ASYNC METHODS # ============================================================================ - async def status_async(self, refresh: bool = True) -> str: + async def status(self, refresh: bool = True) -> str: """ - Check job status (async). + Check job status. Args: refresh: If False, returns cached status if available @@ -86,7 +88,7 @@ async def status_async(self, refresh: bool = True) -> str: Status string: "ready", "in_progress", "error", etc. Example: - >>> status = await job.status_async() + >>> status = await job.status() >>> print(f"Job status: {status}") """ if not refresh and self._cached_status: @@ -95,14 +97,15 @@ async def status_async(self, refresh: bool = True) -> str: self._cached_status = await self._api_client.get_status(self.snapshot_id) return self._cached_status - async def wait_async( + + async def wait( self, timeout: int = 300, poll_interval: int = DEFAULT_POLL_INTERVAL, verbose: bool = False, ) -> str: """ - Wait for job to complete (async). + Wait for job to complete. 
Args: timeout: Maximum seconds to wait @@ -117,7 +120,7 @@ async def wait_async( APIError: If job fails Example: - >>> await job.wait_async(timeout=120, verbose=True) + >>> await job.wait(timeout=120, verbose=True) >>> print("Job completed!") """ start_time = time.time() @@ -128,7 +131,7 @@ async def wait_async( if elapsed > timeout: raise TimeoutError(f"Job {self.snapshot_id} timed out after {timeout}s") - status = await self.status_async(refresh=True) + status = await self.status(refresh=True) if verbose: print(f" [{elapsed:.1f}s] Job status: {status}") @@ -141,12 +144,13 @@ async def wait_async( # Still in progress (can be "running", "in_progress", "pending", etc.) await asyncio.sleep(poll_interval) - async def fetch_async(self, format: str = "json") -> Any: + + async def fetch(self, format: str = "json") -> Any: """ - Fetch job results (async). + Fetch job results. - Note: Does not check if job is ready. Use wait_async() first - or check status_async() to ensure job is complete. + Note: Does not check if job is ready. Use wait() first + or check status() to ensure job is complete. Args: format: Result format ("json" or "raw") @@ -155,19 +159,20 @@ async def fetch_async(self, format: str = "json") -> Any: Job results Example: - >>> await job.wait_async() - >>> data = await job.fetch_async() + >>> await job.wait() + >>> data = await job.fetch() """ self._cached_data = await self._api_client.fetch_result(self.snapshot_id, format=format) return self._cached_data - async def to_result_async( + + async def to_result( self, timeout: int = 300, poll_interval: int = DEFAULT_POLL_INTERVAL, ) -> ScrapeResult: """ - Wait for completion and return as ScrapeResult (async). + Wait for completion and return as ScrapeResult. Convenience method that combines wait + fetch + result creation. @@ -179,7 +184,7 @@ async def to_result_async( ScrapeResult object Example: - >>> result = await job.to_result_async() + >>> result = await job.to_result() >>> if result.success: ... 
print(result.data) """ @@ -187,10 +192,10 @@ async def to_result_async( try: # Wait for completion - await self.wait_async(timeout=timeout, poll_interval=poll_interval) + await self.wait(timeout=timeout, poll_interval=poll_interval) # Fetch results - data = await self.fetch_async() + data = await self.fetch() # Calculate timing end_time = datetime.now(timezone.utc) @@ -219,33 +224,3 @@ async def to_result_async( metadata={"snapshot_id": self.snapshot_id}, ) - # ============================================================================ - # SYNC WRAPPERS - # ============================================================================ - - def status(self, refresh: bool = True) -> str: - """Check job status (sync wrapper).""" - return asyncio.run(self.status_async(refresh=refresh)) - - def wait( - self, - timeout: int = 300, - poll_interval: int = DEFAULT_POLL_INTERVAL, - verbose: bool = False, - ) -> str: - """Wait for job to complete (sync wrapper).""" - return asyncio.run( - self.wait_async(timeout=timeout, poll_interval=poll_interval, verbose=verbose) - ) - - def fetch(self, format: str = "json") -> Any: - """Fetch job results (sync wrapper).""" - return asyncio.run(self.fetch_async(format=format)) - - def to_result( - self, - timeout: int = 300, - poll_interval: int = DEFAULT_POLL_INTERVAL, - ) -> ScrapeResult: - """Wait and return as ScrapeResult (sync wrapper).""" - return asyncio.run(self.to_result_async(timeout=timeout, poll_interval=poll_interval)) diff --git a/src/brightdata/scrapers/linkedin/scraper.py b/src/brightdata/scrapers/linkedin/scraper.py index 0eb3e49..b1db4c0 100644 --- a/src/brightdata/scrapers/linkedin/scraper.py +++ b/src/brightdata/scrapers/linkedin/scraper.py @@ -6,10 +6,14 @@ async workflow (trigger/poll/fetch). API Specifications: -- client.scrape.linkedin.posts(url, timeout=180) -- client.scrape.linkedin.jobs(url, timeout=180) -- client.scrape.linkedin.profiles(url, timeout=180) -- client.scrape.linkedin.companies(url, timeout=180) +- client.scrape.linkedin.posts(url, timeout=180) # async +- client.scrape.linkedin.posts_sync(url, timeout=180) # sync +- client.scrape.linkedin.jobs(url, timeout=180) # async +- client.scrape.linkedin.jobs_sync(url, timeout=180) # sync +- client.scrape.linkedin.profiles(url, timeout=180) # async +- client.scrape.linkedin.profiles_sync(url, timeout=180) # sync +- client.scrape.linkedin.companies(url, timeout=180) # async +- client.scrape.linkedin.companies_sync(url, timeout=180) # sync All methods accept: - url: str | list (required) - Single URL or list of URLs @@ -44,8 +48,14 @@ class LinkedInScraper(BaseWebScraper): Example: >>> scraper = LinkedInScraper(bearer_token="token") >>> - >>> # Scrape profile - >>> result = scraper.profiles( + >>> # Scrape profile (async) + >>> result = await scraper.profiles( + ... url="https://linkedin.com/in/johndoe", + ... timeout=180 + ... ) + >>> + >>> # Scrape profile (sync) + >>> result = scraper.profiles_sync( ... url="https://linkedin.com/in/johndoe", ... timeout=180 ... ) @@ -65,13 +75,13 @@ class LinkedInScraper(BaseWebScraper): # POSTS EXTRACTION (URL-based) # ============================================================================ - async def posts_async( + async def posts( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_SHORT, ) -> Union[ScrapeResult, List[ScrapeResult]]: """ - Scrape LinkedIn posts from URLs (async). + Scrape LinkedIn posts from URLs. Uses standard async workflow: trigger job, poll until ready, then fetch results. 
@@ -83,7 +93,7 @@ async def posts_async( ScrapeResult or List[ScrapeResult] Example: - >>> result = await scraper.posts_async( + >>> result = await scraper.posts( ... url="https://linkedin.com/feed/update/urn:li:activity:123", ... timeout=180 ... ) @@ -96,65 +106,67 @@ async def posts_async( return await self._scrape_urls(url=url, dataset_id=self.DATASET_ID_POSTS, timeout=timeout) - def posts( + + def posts_sync( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_SHORT, ) -> Union[ScrapeResult, List[ScrapeResult]]: """ - Scrape LinkedIn posts (sync wrapper). + Scrape LinkedIn posts from URLs (sync version). - See posts_async() for documentation. + See posts() for full documentation. """ - async def _run(): async with self.engine: - return await self.posts_async(url, timeout) - + return await self.posts(url, timeout) return asyncio.run(_run()) # ============================================================================ # POSTS TRIGGER/STATUS/FETCH (Manual Control) # ============================================================================ - async def posts_trigger_async(self, url: Union[str, List[str]]) -> ScrapeJob: - """Trigger LinkedIn posts scrape (async - manual control).""" + async def posts_trigger(self, url: Union[str, List[str]]) -> ScrapeJob: + """Trigger LinkedIn posts scrape (manual control).""" sdk_function = get_caller_function_name() return await self._trigger_scrape_async( urls=url, dataset_id=self.DATASET_ID_POSTS, sdk_function=sdk_function or "posts_trigger" ) - def posts_trigger(self, url: Union[str, List[str]]) -> ScrapeJob: - """Trigger LinkedIn posts scrape (sync wrapper).""" - return asyncio.run(self.posts_trigger_async(url)) - async def posts_status_async(self, snapshot_id: str) -> str: - """Check LinkedIn posts scrape status (async).""" + def posts_trigger_sync(self, url: Union[str, List[str]]) -> ScrapeJob: + """Trigger LinkedIn posts scrape (sync version).""" + return asyncio.run(self.posts_trigger(url)) + + async def posts_status(self, snapshot_id: str) -> str: + """Check LinkedIn posts scrape status.""" return await self._check_status_async(snapshot_id) - def posts_status(self, snapshot_id: str) -> str: - """Check LinkedIn posts scrape status (sync wrapper).""" - return asyncio.run(self.posts_status_async(snapshot_id)) - async def posts_fetch_async(self, snapshot_id: str) -> Any: - """Fetch LinkedIn posts scrape results (async).""" + def posts_status_sync(self, snapshot_id: str) -> str: + """Check LinkedIn posts scrape status (sync version).""" + return asyncio.run(self.posts_status(snapshot_id)) + + async def posts_fetch(self, snapshot_id: str) -> Any: + """Fetch LinkedIn posts scrape results.""" return await self._fetch_results_async(snapshot_id) - def posts_fetch(self, snapshot_id: str) -> Any: - """Fetch LinkedIn posts scrape results (sync wrapper).""" - return asyncio.run(self.posts_fetch_async(snapshot_id)) + + def posts_fetch_sync(self, snapshot_id: str) -> Any: + """Fetch LinkedIn posts scrape results (sync version).""" + return asyncio.run(self.posts_fetch(snapshot_id)) # ============================================================================ # JOBS EXTRACTION (URL-based) # ============================================================================ - async def jobs_async( + async def jobs( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_SHORT, ) -> Union[ScrapeResult, List[ScrapeResult]]: """ - Scrape LinkedIn jobs from URLs (async). + Scrape LinkedIn jobs from URLs. 
Uses standard async workflow: trigger job, poll until ready, then fetch results. @@ -166,7 +178,7 @@ async def jobs_async( ScrapeResult or List[ScrapeResult] Example: - >>> result = await scraper.jobs_async( + >>> result = await scraper.jobs( ... url="https://linkedin.com/jobs/view/123456", ... timeout=180 ... ) @@ -178,61 +190,63 @@ async def jobs_async( return await self._scrape_urls(url=url, dataset_id=self.DATASET_ID_JOBS, timeout=timeout) - def jobs( + + def jobs_sync( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_SHORT, ) -> Union[ScrapeResult, List[ScrapeResult]]: - """Scrape LinkedIn jobs (sync wrapper).""" - + """Scrape LinkedIn jobs from URLs (sync version).""" async def _run(): async with self.engine: - return await self.jobs_async(url, timeout) - + return await self.jobs(url, timeout) return asyncio.run(_run()) # ============================================================================ # JOBS TRIGGER/STATUS/FETCH (Manual Control) # ============================================================================ - async def jobs_trigger_async(self, url: Union[str, List[str]]) -> ScrapeJob: - """Trigger LinkedIn jobs scrape (async - manual control).""" + async def jobs_trigger(self, url: Union[str, List[str]]) -> ScrapeJob: + """Trigger LinkedIn jobs scrape (manual control).""" sdk_function = get_caller_function_name() return await self._trigger_scrape_async( urls=url, dataset_id=self.DATASET_ID_JOBS, sdk_function=sdk_function or "jobs_trigger" ) - def jobs_trigger(self, url: Union[str, List[str]]) -> ScrapeJob: - """Trigger LinkedIn jobs scrape (sync wrapper).""" - return asyncio.run(self.jobs_trigger_async(url)) - async def jobs_status_async(self, snapshot_id: str) -> str: - """Check LinkedIn jobs scrape status (async).""" + def jobs_trigger_sync(self, url: Union[str, List[str]]) -> ScrapeJob: + """Trigger LinkedIn jobs scrape (sync version).""" + return asyncio.run(self.jobs_trigger(url)) + + async def jobs_status(self, snapshot_id: str) -> str: + """Check LinkedIn jobs scrape status.""" return await self._check_status_async(snapshot_id) - def jobs_status(self, snapshot_id: str) -> str: - """Check LinkedIn jobs scrape status (sync wrapper).""" - return asyncio.run(self.jobs_status_async(snapshot_id)) - async def jobs_fetch_async(self, snapshot_id: str) -> Any: - """Fetch LinkedIn jobs scrape results (async).""" + def jobs_status_sync(self, snapshot_id: str) -> str: + """Check LinkedIn jobs scrape status (sync version).""" + return asyncio.run(self.jobs_status(snapshot_id)) + + async def jobs_fetch(self, snapshot_id: str) -> Any: + """Fetch LinkedIn jobs scrape results.""" return await self._fetch_results_async(snapshot_id) - def jobs_fetch(self, snapshot_id: str) -> Any: - """Fetch LinkedIn jobs scrape results (sync wrapper).""" - return asyncio.run(self.jobs_fetch_async(snapshot_id)) + + def jobs_fetch_sync(self, snapshot_id: str) -> Any: + """Fetch LinkedIn jobs scrape results (sync version).""" + return asyncio.run(self.jobs_fetch(snapshot_id)) # ============================================================================ # PROFILES EXTRACTION (URL-based) # ============================================================================ - async def profiles_async( + async def profiles( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_SHORT, ) -> Union[ScrapeResult, List[ScrapeResult]]: """ - Scrape LinkedIn profiles from URLs (async). + Scrape LinkedIn profiles from URLs. 
Uses standard async workflow: trigger job, poll until ready, then fetch results. @@ -244,7 +258,7 @@ async def profiles_async( ScrapeResult or List[ScrapeResult] Example: - >>> result = await scraper.profiles_async( + >>> result = await scraper.profiles( ... url="https://linkedin.com/in/johndoe", ... timeout=180 ... ) @@ -256,59 +270,63 @@ async def profiles_async( return await self._scrape_urls(url=url, dataset_id=self.DATASET_ID, timeout=timeout) - def profiles( + + def profiles_sync( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_SHORT, ) -> Union[ScrapeResult, List[ScrapeResult]]: - """Scrape LinkedIn profiles (sync wrapper).""" - + """Scrape LinkedIn profiles from URLs (sync version).""" async def _run(): async with self.engine: - return await self.profiles_async(url, timeout) - + return await self.profiles(url, timeout) return asyncio.run(_run()) - # --- Trigger Interface (Manual Control) --- + # ============================================================================ + # PROFILES TRIGGER/STATUS/FETCH (Manual Control) + # ============================================================================ - async def profiles_trigger_async(self, url: Union[str, List[str]]) -> ScrapeJob: - """Trigger LinkedIn profiles scrape (async - manual control).""" + async def profiles_trigger(self, url: Union[str, List[str]]) -> ScrapeJob: + """Trigger LinkedIn profiles scrape (manual control).""" sdk_function = get_caller_function_name() return await self._trigger_scrape_async( urls=url, sdk_function=sdk_function or "profiles_trigger" ) - def profiles_trigger(self, url: Union[str, List[str]]) -> ScrapeJob: - """Trigger LinkedIn profiles scrape (sync wrapper).""" - return asyncio.run(self.profiles_trigger_async(url)) - async def profiles_status_async(self, snapshot_id: str) -> str: - """Check LinkedIn profiles scrape status (async).""" + def profiles_trigger_sync(self, url: Union[str, List[str]]) -> ScrapeJob: + """Trigger LinkedIn profiles scrape (sync version).""" + return asyncio.run(self.profiles_trigger(url)) + + async def profiles_status(self, snapshot_id: str) -> str: + """Check LinkedIn profiles scrape status.""" return await self._check_status_async(snapshot_id) - def profiles_status(self, snapshot_id: str) -> str: - """Check LinkedIn profiles scrape status (sync wrapper).""" - return asyncio.run(self.profiles_status_async(snapshot_id)) - async def profiles_fetch_async(self, snapshot_id: str) -> Any: - """Fetch LinkedIn profiles scrape results (async).""" + def profiles_status_sync(self, snapshot_id: str) -> str: + """Check LinkedIn profiles scrape status (sync version).""" + return asyncio.run(self.profiles_status(snapshot_id)) + + async def profiles_fetch(self, snapshot_id: str) -> Any: + """Fetch LinkedIn profiles scrape results.""" return await self._fetch_results_async(snapshot_id) - def profiles_fetch(self, snapshot_id: str) -> Any: - """Fetch LinkedIn profiles scrape results (sync wrapper).""" - return asyncio.run(self.profiles_fetch_async(snapshot_id)) + + def profiles_fetch_sync(self, snapshot_id: str) -> Any: + """Fetch LinkedIn profiles scrape results (sync version).""" + return asyncio.run(self.profiles_fetch(snapshot_id)) # ============================================================================ # COMPANIES EXTRACTION (URL-based) # ============================================================================ - async def companies_async( + async def companies( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_SHORT, ) -> Union[ScrapeResult, 
List[ScrapeResult]]: """ - Scrape LinkedIn companies from URLs (async). + Scrape LinkedIn companies from URLs. Uses standard async workflow: trigger job, poll until ready, then fetch results. @@ -320,7 +338,7 @@ async def companies_async( ScrapeResult or List[ScrapeResult] Example: - >>> result = await scraper.companies_async( + >>> result = await scraper.companies( ... url="https://linkedin.com/company/microsoft", ... timeout=180 ... ) @@ -334,25 +352,24 @@ async def companies_async( url=url, dataset_id=self.DATASET_ID_COMPANIES, timeout=timeout ) - def companies( + + def companies_sync( self, url: Union[str, List[str]], timeout: int = DEFAULT_TIMEOUT_SHORT, ) -> Union[ScrapeResult, List[ScrapeResult]]: - """Scrape LinkedIn companies (sync wrapper).""" - + """Scrape LinkedIn companies from URLs (sync version).""" async def _run(): async with self.engine: - return await self.companies_async(url, timeout) - + return await self.companies(url, timeout) return asyncio.run(_run()) # ============================================================================ # COMPANIES TRIGGER/STATUS/FETCH (Manual Control) # ============================================================================ - async def companies_trigger_async(self, url: Union[str, List[str]]) -> ScrapeJob: - """Trigger LinkedIn companies scrape (async - manual control).""" + async def companies_trigger(self, url: Union[str, List[str]]) -> ScrapeJob: + """Trigger LinkedIn companies scrape (manual control).""" sdk_function = get_caller_function_name() return await self._trigger_scrape_async( urls=url, @@ -360,25 +377,28 @@ async def companies_trigger_async(self, url: Union[str, List[str]]) -> ScrapeJob sdk_function=sdk_function or "companies_trigger", ) - def companies_trigger(self, url: Union[str, List[str]]) -> ScrapeJob: - """Trigger LinkedIn companies scrape (sync wrapper).""" - return asyncio.run(self.companies_trigger_async(url)) - async def companies_status_async(self, snapshot_id: str) -> str: - """Check LinkedIn companies scrape status (async).""" + def companies_trigger_sync(self, url: Union[str, List[str]]) -> ScrapeJob: + """Trigger LinkedIn companies scrape (sync version).""" + return asyncio.run(self.companies_trigger(url)) + + async def companies_status(self, snapshot_id: str) -> str: + """Check LinkedIn companies scrape status.""" return await self._check_status_async(snapshot_id) - def companies_status(self, snapshot_id: str) -> str: - """Check LinkedIn companies scrape status (sync wrapper).""" - return asyncio.run(self.companies_status_async(snapshot_id)) - async def companies_fetch_async(self, snapshot_id: str) -> Any: - """Fetch LinkedIn companies scrape results (async).""" + def companies_status_sync(self, snapshot_id: str) -> str: + """Check LinkedIn companies scrape status (sync version).""" + return asyncio.run(self.companies_status(snapshot_id)) + + async def companies_fetch(self, snapshot_id: str) -> Any: + """Fetch LinkedIn companies scrape results.""" return await self._fetch_results_async(snapshot_id) - def companies_fetch(self, snapshot_id: str) -> Any: - """Fetch LinkedIn companies scrape results (sync wrapper).""" - return asyncio.run(self.companies_fetch_async(snapshot_id)) + + def companies_fetch_sync(self, snapshot_id: str) -> Any: + """Fetch LinkedIn companies scrape results (sync version).""" + return asyncio.run(self.companies_fetch(snapshot_id)) # ============================================================================ # CORE SCRAPING LOGIC (Standard async workflow) diff --git 
a/src/brightdata/scrapers/linkedin/search.py b/src/brightdata/scrapers/linkedin/search.py index 352a795..c41d558 100644 --- a/src/brightdata/scrapers/linkedin/search.py +++ b/src/brightdata/scrapers/linkedin/search.py @@ -2,9 +2,12 @@ LinkedIn Search Scraper - Discovery/parameter-based operations. Implements: -- client.search.linkedin.posts() - Discover posts by profile and date range -- client.search.linkedin.profiles() - Find profiles by name -- client.search.linkedin.jobs() - Find jobs by keyword/location/filters +- client.search.linkedin.posts() - Discover posts by profile and date range (async) +- client.search.linkedin.posts_sync() - Discover posts by profile and date range (sync) +- client.search.linkedin.profiles() - Find profiles by name (async) +- client.search.linkedin.profiles_sync() - Find profiles by name (sync) +- client.search.linkedin.jobs() - Find jobs by keyword/location/filters (async) +- client.search.linkedin.jobs_sync() - Find jobs by keyword/location/filters (sync) """ import asyncio @@ -30,11 +33,19 @@ class LinkedInSearchScraper: Example: >>> scraper = LinkedInSearchScraper(bearer_token="token") - >>> result = scraper.jobs( + >>> + >>> # Async + >>> result = await scraper.jobs( ... keyword="python developer", ... location="New York", ... remote=True ... ) + >>> + >>> # Sync + >>> result = scraper.jobs_sync( + ... keyword="python developer", + ... location="New York" + ... ) """ # Dataset IDs for different LinkedIn types @@ -65,7 +76,7 @@ def __init__(self, bearer_token: str, engine: Optional[AsyncEngine] = None): # POSTS DISCOVERY (by profile + date range) # ============================================================================ - async def posts_async( + async def posts( self, profile_url: Union[str, List[str]], start_date: Optional[Union[str, List[str]]] = None, @@ -85,7 +96,7 @@ async def posts_async( ScrapeResult with discovered posts Example: - >>> result = await search.posts_async( + >>> result = await search.posts( ... profile_url="https://linkedin.com/in/johndoe", ... start_date="2025-01-01", ... end_date="2025-12-31" @@ -113,7 +124,8 @@ async def posts_async( payload=payload, dataset_id=self.DATASET_ID_POSTS, timeout=timeout ) - def posts( + + def posts_sync( self, profile_url: Union[str, List[str]], start_date: Optional[Union[str, List[str]]] = None, @@ -121,22 +133,20 @@ def posts( timeout: int = DEFAULT_TIMEOUT_SHORT, ) -> ScrapeResult: """ - Discover posts from profile(s) (sync). + Discover posts from profile(s) (sync version). - See posts_async() for documentation. + See posts() for documentation. """ - async def _run(): async with self.engine: - return await self.posts_async(profile_url, start_date, end_date, timeout) - + return await self.posts(profile_url, start_date, end_date, timeout) return asyncio.run(_run()) # ============================================================================ # PROFILES DISCOVERY (by name) # ============================================================================ - async def profiles_async( + async def profiles( self, firstName: Union[str, List[str]], lastName: Optional[Union[str, List[str]]] = None, @@ -154,7 +164,7 @@ async def profiles_async( ScrapeResult with matching profiles Example: - >>> result = await search.profiles_async( + >>> result = await search.profiles( ... firstName="John", ... lastName="Doe" ... 
) @@ -177,29 +187,28 @@ async def profiles_async( payload=payload, dataset_id=self.DATASET_ID_PROFILES, timeout=timeout ) - def profiles( + + def profiles_sync( self, firstName: Union[str, List[str]], lastName: Optional[Union[str, List[str]]] = None, timeout: int = DEFAULT_TIMEOUT_SHORT, ) -> ScrapeResult: """ - Find profiles by name (sync). + Find profiles by name (sync version). - See profiles_async() for documentation. + See profiles() for documentation. """ - async def _run(): async with self.engine: - return await self.profiles_async(firstName, lastName, timeout) - + return await self.profiles(firstName, lastName, timeout) return asyncio.run(_run()) # ============================================================================ # JOBS DISCOVERY (by keyword + extensive filters) # ============================================================================ - async def jobs_async( + async def jobs( self, url: Optional[Union[str, List[str]]] = None, location: Optional[Union[str, List[str]]] = None, @@ -233,7 +242,7 @@ async def jobs_async( ScrapeResult with matching jobs Example: - >>> result = await search.jobs_async( + >>> result = await search.jobs( ... keyword="python developer", ... location="New York", ... remote=True, @@ -302,7 +311,8 @@ async def jobs_async( return await self._execute_search(payload=payload, dataset_id=dataset_id, timeout=timeout) - def jobs( + + def jobs_sync( self, url: Optional[Union[str, List[str]]] = None, location: Optional[Union[str, List[str]]] = None, @@ -317,21 +327,13 @@ def jobs( timeout: int = DEFAULT_TIMEOUT_SHORT, ) -> ScrapeResult: """ - Discover jobs (sync). - - See jobs_async() for full documentation. + Discover jobs (sync version). - Example: - >>> result = search.jobs( - ... keyword="python", - ... location="NYC", - ... remote=True - ... ) + See jobs() for full documentation. """ - async def _run(): async with self.engine: - return await self.jobs_async( + return await self.jobs( url=url, location=location, keyword=keyword, @@ -344,7 +346,6 @@ async def _run(): locationRadius=locationRadius, timeout=timeout, ) - return asyncio.run(_run()) # ============================================================================ diff --git a/src/brightdata/sync_client.py b/src/brightdata/sync_client.py new file mode 100644 index 0000000..6896b4d --- /dev/null +++ b/src/brightdata/sync_client.py @@ -0,0 +1,732 @@ +""" +Synchronous client adapter for Bright Data SDK. + +Provides sync interface using persistent event loop for optimal performance. +""" + +import asyncio +from typing import Optional, List, Dict, Any, Union + +from .client import BrightDataClient +from .models import ScrapeResult, SearchResult +from .types import AccountInfo + + +class SyncBrightDataClient: + """ + Synchronous adapter for BrightDataClient. + + Uses a persistent event loop for all operations, providing better + performance than repeated asyncio.run() calls. + + WARNING: This client is NOT thread-safe. For multi-threaded usage, + create a separate SyncBrightDataClient per thread. + + Example: + >>> with SyncBrightDataClient(token="...") as client: + ... zones = client.list_zones() + ... 
result = client.scrape.amazon.products(url) + """ + + def __init__( + self, + token: Optional[str] = None, + customer_id: Optional[str] = None, + timeout: int = 30, + web_unlocker_zone: Optional[str] = None, + serp_zone: Optional[str] = None, + browser_zone: Optional[str] = None, + auto_create_zones: bool = True, + validate_token: bool = False, + rate_limit: Optional[float] = None, + rate_period: float = 1.0, + ): + """ + Initialize sync client. + + Args: + token: Bright Data API token (or set BRIGHT_DATA_API_TOKEN env var) + customer_id: Customer ID (optional, extracted from token if not provided) + timeout: Default request timeout in seconds + web_unlocker_zone: Zone name for Web Unlocker API + serp_zone: Zone name for SERP API + browser_zone: Zone name for Browser API + auto_create_zones: Automatically create required zones if missing + validate_token: Validate token on initialization + rate_limit: Rate limit (requests per period) + rate_period: Rate limit period in seconds + """ + # Check if we're inside an async context - FIXED logic + try: + asyncio.get_running_loop() + # If we get here, there IS a running loop - this is an error + raise RuntimeError( + "SyncBrightDataClient cannot be used inside async context. " + "Use BrightDataClient with async/await instead." + ) + except RuntimeError as e: + # Only pass if it's the "no running event loop" error + if "no running event loop" not in str(e).lower(): + raise # Re-raise our custom error or other RuntimeErrors + # No running loop - correct for sync usage, continue + + self._async_client = BrightDataClient( + token=token, + customer_id=customer_id, + timeout=timeout, + web_unlocker_zone=web_unlocker_zone, + serp_zone=serp_zone, + browser_zone=browser_zone, + auto_create_zones=auto_create_zones, + validate_token=False, # Will validate during __enter__ + rate_limit=rate_limit, + rate_period=rate_period, + ) + self._validate_token = validate_token + self._loop: Optional[asyncio.AbstractEventLoop] = None + self._scrape: Optional["SyncScrapeService"] = None + self._search: Optional["SyncSearchService"] = None + self._crawler: Optional["SyncCrawlerService"] = None + + def __enter__(self): + """Initialize persistent event loop and async client.""" + # Create persistent loop + self._loop = asyncio.new_event_loop() + asyncio.set_event_loop(self._loop) + + # Initialize async client + self._loop.run_until_complete(self._async_client.__aenter__()) + + # Validate token if requested + if self._validate_token: + is_valid = self._loop.run_until_complete( + self._async_client.test_connection() + ) + if not is_valid: + self.__exit__(None, None, None) + from .exceptions import AuthenticationError + + raise AuthenticationError( + "Token validation failed. Token appears to be invalid." 
+ ) + + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Cleanup async client and event loop.""" + if self._loop is None: + return + + try: + # Cleanup async client + self._loop.run_until_complete( + self._async_client.__aexit__(exc_type, exc_val, exc_tb) + ) + + # Give the event loop a moment to process any remaining callbacks + # This helps prevent "Unclosed client session" warnings + self._loop.run_until_complete(asyncio.sleep(0.05)) + + # Cancel any remaining tasks + pending = asyncio.all_tasks(self._loop) + for task in pending: + task.cancel() + + # Let cancellations propagate + if pending: + self._loop.run_until_complete( + asyncio.gather(*pending, return_exceptions=True) + ) + except Exception: + # Ignore errors during cleanup + pass + finally: + # Close the loop + try: + self._loop.close() + except Exception: + pass + self._loop = None + + def _run(self, coro): + """Run coroutine in persistent loop.""" + if self._loop is None: + raise RuntimeError( + "SyncBrightDataClient not initialized. " + "Use: with SyncBrightDataClient() as client: ..." + ) + return self._loop.run_until_complete(coro) + + # ======================================== + # Utility Methods + # ======================================== + + def list_zones(self) -> List[Dict[str, Any]]: + """List all active zones.""" + return self._run(self._async_client.list_zones()) + + def delete_zone(self, zone_name: str) -> None: + """Delete a zone.""" + return self._run(self._async_client.delete_zone(zone_name)) + + def get_account_info(self, refresh: bool = False) -> AccountInfo: + """Get account information.""" + return self._run(self._async_client.get_account_info(refresh=refresh)) + + def test_connection(self) -> bool: + """Test API connection.""" + return self._run(self._async_client.test_connection()) + + def scrape_url(self, url, **kwargs): + """Scrape URL using Web Unlocker.""" + return self._run(self._async_client.scrape_url(url, **kwargs)) + + # ======================================== + # Service Properties + # ======================================== + + @property + def scrape(self) -> "SyncScrapeService": + """Access scraping services (sync).""" + if self._scrape is None: + self._scrape = SyncScrapeService(self._async_client.scrape, self._loop) + return self._scrape + + @property + def search(self) -> "SyncSearchService": + """Access search services (sync).""" + if self._search is None: + self._search = SyncSearchService(self._async_client.search, self._loop) + return self._search + + @property + def crawler(self) -> "SyncCrawlerService": + """Access crawler services (sync).""" + if self._crawler is None: + self._crawler = SyncCrawlerService(self._async_client.crawler, self._loop) + return self._crawler + + @property + def token(self) -> str: + """Get API token.""" + return self._async_client.token + + def __repr__(self) -> str: + """String representation.""" + token_preview = ( + f"{self.token[:10]}...{self.token[-5:]}" if self.token else "None" + ) + status = "Initialized" if self._loop else "Not initialized" + return f"" + + +# ============================================================================ +# SYNC SCRAPE SERVICE +# ============================================================================ + + +class SyncScrapeService: + """Sync wrapper for ScrapeService.""" + + def __init__(self, async_service, loop): + self._async = async_service + self._loop = loop + self._amazon = None + self._linkedin = None + self._instagram = None + self._facebook = None + self._chatgpt = None + + @property 
+ def amazon(self) -> "SyncAmazonScraper": + if self._amazon is None: + self._amazon = SyncAmazonScraper(self._async.amazon, self._loop) + return self._amazon + + @property + def linkedin(self) -> "SyncLinkedInScraper": + if self._linkedin is None: + self._linkedin = SyncLinkedInScraper(self._async.linkedin, self._loop) + return self._linkedin + + @property + def instagram(self) -> "SyncInstagramScraper": + if self._instagram is None: + self._instagram = SyncInstagramScraper(self._async.instagram, self._loop) + return self._instagram + + @property + def facebook(self) -> "SyncFacebookScraper": + if self._facebook is None: + self._facebook = SyncFacebookScraper(self._async.facebook, self._loop) + return self._facebook + + @property + def chatgpt(self) -> "SyncChatGPTScraper": + if self._chatgpt is None: + self._chatgpt = SyncChatGPTScraper(self._async.chatgpt, self._loop) + return self._chatgpt + + + +class SyncAmazonScraper: + """Sync wrapper for AmazonScraper - COMPLETE with all methods.""" + + def __init__(self, async_scraper, loop): + self._async = async_scraper + self._loop = loop + + # Products + def products(self, url, **kwargs) -> ScrapeResult: + """Scrape Amazon product details.""" + return self._loop.run_until_complete(self._async.products(url, **kwargs)) + + def products_trigger(self, url, **kwargs): + """Trigger Amazon products scrape.""" + return self._loop.run_until_complete( + self._async.products_trigger(url, **kwargs) + ) + + def products_status(self, snapshot_id): + """Check Amazon products scrape status.""" + return self._loop.run_until_complete( + self._async.products_status(snapshot_id) + ) + + def products_fetch(self, snapshot_id): + """Fetch Amazon products scrape results.""" + return self._loop.run_until_complete(self._async.products_fetch(snapshot_id)) + + # Reviews + def reviews(self, url, **kwargs) -> ScrapeResult: + """Scrape Amazon reviews.""" + return self._loop.run_until_complete(self._async.reviews(url, **kwargs)) + + def reviews_trigger(self, url, **kwargs): + """Trigger Amazon reviews scrape.""" + return self._loop.run_until_complete( + self._async.reviews_trigger(url, **kwargs) + ) + + def reviews_status(self, snapshot_id): + """Check Amazon reviews scrape status.""" + return self._loop.run_until_complete(self._async.reviews_status(snapshot_id)) + + def reviews_fetch(self, snapshot_id): + """Fetch Amazon reviews scrape results.""" + return self._loop.run_until_complete(self._async.reviews_fetch(snapshot_id)) + + # Sellers + def sellers(self, url, **kwargs) -> ScrapeResult: + """Scrape Amazon sellers.""" + return self._loop.run_until_complete(self._async.sellers(url, **kwargs)) + + def sellers_trigger(self, url, **kwargs): + """Trigger Amazon sellers scrape.""" + return self._loop.run_until_complete( + self._async.sellers_trigger(url, **kwargs) + ) + + def sellers_status(self, snapshot_id): + """Check Amazon sellers scrape status.""" + return self._loop.run_until_complete(self._async.sellers_status(snapshot_id)) + + def sellers_fetch(self, snapshot_id): + """Fetch Amazon sellers scrape results.""" + return self._loop.run_until_complete(self._async.sellers_fetch(snapshot_id)) + + +class SyncLinkedInScraper: + """Sync wrapper for LinkedInScraper - COMPLETE with all methods.""" + + def __init__(self, async_scraper, loop): + self._async = async_scraper + self._loop = loop + + # Posts - Call async methods (posts) not sync wrappers (posts_sync) + # because sync wrappers use asyncio.run() which conflicts with our persistent loop + def posts(self, url, **kwargs): + 
return self._loop.run_until_complete(self._async.posts(url, **kwargs)) + + def posts_trigger(self, url, **kwargs): + return self._loop.run_until_complete(self._async.posts_trigger(url, **kwargs)) + + def posts_status(self, snapshot_id): + return self._loop.run_until_complete(self._async.posts_status(snapshot_id)) + + def posts_fetch(self, snapshot_id): + return self._loop.run_until_complete(self._async.posts_fetch(snapshot_id)) + + # Jobs + def jobs(self, url, **kwargs): + return self._loop.run_until_complete(self._async.jobs(url, **kwargs)) + + def jobs_trigger(self, url, **kwargs): + return self._loop.run_until_complete(self._async.jobs_trigger(url, **kwargs)) + + def jobs_status(self, snapshot_id): + return self._loop.run_until_complete(self._async.jobs_status(snapshot_id)) + + def jobs_fetch(self, snapshot_id): + return self._loop.run_until_complete(self._async.jobs_fetch(snapshot_id)) + + # Profiles + def profiles(self, url, **kwargs): + return self._loop.run_until_complete(self._async.profiles(url, **kwargs)) + + def profiles_trigger(self, url, **kwargs): + return self._loop.run_until_complete(self._async.profiles_trigger(url, **kwargs)) + + def profiles_status(self, snapshot_id): + return self._loop.run_until_complete(self._async.profiles_status(snapshot_id)) + + def profiles_fetch(self, snapshot_id): + return self._loop.run_until_complete(self._async.profiles_fetch(snapshot_id)) + + # Companies + def companies(self, url, **kwargs): + return self._loop.run_until_complete(self._async.companies(url, **kwargs)) + + def companies_trigger(self, url, **kwargs): + return self._loop.run_until_complete(self._async.companies_trigger(url, **kwargs)) + + def companies_status(self, snapshot_id): + return self._loop.run_until_complete(self._async.companies_status(snapshot_id)) + + def companies_fetch(self, snapshot_id): + return self._loop.run_until_complete(self._async.companies_fetch(snapshot_id)) + + +class SyncInstagramScraper: + """Sync wrapper for InstagramScraper - COMPLETE with all methods.""" + + def __init__(self, async_scraper, loop): + self._async = async_scraper + self._loop = loop + + # Profiles - NOTE: Must call async methods (not _sync wrappers) because they use asyncio.run() + def profiles(self, url, **kwargs): + return self._loop.run_until_complete(self._async.profiles(url, **kwargs)) + + def profiles_trigger(self, url, **kwargs): + return self._loop.run_until_complete( + self._async.profiles_trigger(url, **kwargs) + ) + + def profiles_status(self, snapshot_id): + return self._loop.run_until_complete(self._async.profiles_status(snapshot_id)) + + def profiles_fetch(self, snapshot_id): + return self._loop.run_until_complete(self._async.profiles_fetch(snapshot_id)) + + # Posts + def posts(self, url, **kwargs): + return self._loop.run_until_complete(self._async.posts(url, **kwargs)) + + def posts_trigger(self, url, **kwargs): + return self._loop.run_until_complete(self._async.posts_trigger(url, **kwargs)) + + def posts_status(self, snapshot_id): + return self._loop.run_until_complete(self._async.posts_status(snapshot_id)) + + def posts_fetch(self, snapshot_id): + return self._loop.run_until_complete(self._async.posts_fetch(snapshot_id)) + + # Comments + def comments(self, url, **kwargs): + return self._loop.run_until_complete(self._async.comments(url, **kwargs)) + + def comments_trigger(self, url, **kwargs): + return self._loop.run_until_complete( + self._async.comments_trigger(url, **kwargs) + ) + + def comments_status(self, snapshot_id): + return 
self._loop.run_until_complete(self._async.comments_status(snapshot_id)) + + def comments_fetch(self, snapshot_id): + return self._loop.run_until_complete(self._async.comments_fetch(snapshot_id)) + + # Reels + def reels(self, url, **kwargs): + return self._loop.run_until_complete(self._async.reels(url, **kwargs)) + + def reels_trigger(self, url, **kwargs): + return self._loop.run_until_complete(self._async.reels_trigger(url, **kwargs)) + + def reels_status(self, snapshot_id): + return self._loop.run_until_complete(self._async.reels_status(snapshot_id)) + + def reels_fetch(self, snapshot_id): + return self._loop.run_until_complete(self._async.reels_fetch(snapshot_id)) + + +class SyncFacebookScraper: + """Sync wrapper for FacebookScraper - COMPLETE with all methods.""" + + def __init__(self, async_scraper, loop): + self._async = async_scraper + self._loop = loop + + # Posts by profile - NOTE: Must call async methods (not _sync wrappers) because they use asyncio.run() + def posts_by_profile(self, url, **kwargs): + return self._loop.run_until_complete( + self._async.posts_by_profile(url, **kwargs) + ) + + def posts_by_profile_trigger(self, url, **kwargs): + return self._loop.run_until_complete( + self._async.posts_by_profile_trigger(url, **kwargs) + ) + + def posts_by_profile_status(self, snapshot_id): + return self._loop.run_until_complete( + self._async.posts_by_profile_status(snapshot_id) + ) + + def posts_by_profile_fetch(self, snapshot_id): + return self._loop.run_until_complete( + self._async.posts_by_profile_fetch(snapshot_id) + ) + + # Posts by group + def posts_by_group(self, url, **kwargs): + return self._loop.run_until_complete( + self._async.posts_by_group(url, **kwargs) + ) + + def posts_by_group_trigger(self, url, **kwargs): + return self._loop.run_until_complete( + self._async.posts_by_group_trigger(url, **kwargs) + ) + + def posts_by_group_status(self, snapshot_id): + return self._loop.run_until_complete( + self._async.posts_by_group_status(snapshot_id) + ) + + def posts_by_group_fetch(self, snapshot_id): + return self._loop.run_until_complete( + self._async.posts_by_group_fetch(snapshot_id) + ) + + # Posts by URL + def posts_by_url(self, url, **kwargs): + return self._loop.run_until_complete(self._async.posts_by_url(url, **kwargs)) + + def posts_by_url_trigger(self, url, **kwargs): + return self._loop.run_until_complete( + self._async.posts_by_url_trigger(url, **kwargs) + ) + + def posts_by_url_status(self, snapshot_id): + return self._loop.run_until_complete( + self._async.posts_by_url_status(snapshot_id) + ) + + def posts_by_url_fetch(self, snapshot_id): + return self._loop.run_until_complete( + self._async.posts_by_url_fetch(snapshot_id) + ) + + # Comments + def comments(self, url, **kwargs): + return self._loop.run_until_complete(self._async.comments(url, **kwargs)) + + def comments_trigger(self, url, **kwargs): + return self._loop.run_until_complete( + self._async.comments_trigger(url, **kwargs) + ) + + def comments_status(self, snapshot_id): + return self._loop.run_until_complete(self._async.comments_status(snapshot_id)) + + def comments_fetch(self, snapshot_id): + return self._loop.run_until_complete(self._async.comments_fetch(snapshot_id)) + + # Reels + def reels(self, url, **kwargs): + return self._loop.run_until_complete(self._async.reels(url, **kwargs)) + + def reels_trigger(self, url, **kwargs): + return self._loop.run_until_complete(self._async.reels_trigger(url, **kwargs)) + + def reels_status(self, snapshot_id): + return 
self._loop.run_until_complete(self._async.reels_status(snapshot_id)) + + def reels_fetch(self, snapshot_id): + return self._loop.run_until_complete(self._async.reels_fetch(snapshot_id)) + + +class SyncChatGPTScraper: + """Sync wrapper for ChatGPTScraper - COMPLETE with all methods.""" + + def __init__(self, async_scraper, loop): + self._async = async_scraper + self._loop = loop + + # Prompt - Call async methods (prompt) not sync wrappers (prompt_sync) + # because sync wrappers use asyncio.run() which conflicts with our persistent loop + def prompt(self, prompt_text, **kwargs): + return self._loop.run_until_complete( + self._async.prompt(prompt_text, **kwargs) + ) + + def prompt_trigger(self, prompt_text, **kwargs): + return self._loop.run_until_complete( + self._async.prompt_trigger(prompt_text, **kwargs) + ) + + def prompt_status(self, snapshot_id): + return self._loop.run_until_complete(self._async.prompt_status(snapshot_id)) + + def prompt_fetch(self, snapshot_id): + return self._loop.run_until_complete(self._async.prompt_fetch(snapshot_id)) + + # Prompts (batch) + def prompts(self, prompts, **kwargs): + return self._loop.run_until_complete(self._async.prompts(prompts, **kwargs)) + + def prompts_trigger(self, prompts, **kwargs): + return self._loop.run_until_complete( + self._async.prompts_trigger(prompts, **kwargs) + ) + + def prompts_status(self, snapshot_id): + return self._loop.run_until_complete(self._async.prompts_status(snapshot_id)) + + def prompts_fetch(self, snapshot_id): + return self._loop.run_until_complete(self._async.prompts_fetch(snapshot_id)) + + +# ============================================================================ +# SYNC SEARCH SERVICE +# ============================================================================ + + +class SyncSearchService: + """Sync wrapper for SearchService - COMPLETE.""" + + def __init__(self, async_service, loop): + self._async = async_service + self._loop = loop + self._amazon = None + self._linkedin = None + self._instagram = None + + def google(self, query, **kwargs) -> SearchResult: + """Search Google.""" + return self._loop.run_until_complete(self._async.google(query, **kwargs)) + + def bing(self, query, **kwargs) -> SearchResult: + """Search Bing.""" + return self._loop.run_until_complete(self._async.bing(query, **kwargs)) + + def yandex(self, query, **kwargs) -> SearchResult: + """Search Yandex.""" + return self._loop.run_until_complete(self._async.yandex(query, **kwargs)) + + @property + def amazon(self) -> "SyncAmazonSearchScraper": + """Amazon search service.""" + if self._amazon is None: + self._amazon = SyncAmazonSearchScraper(self._async.amazon, self._loop) + return self._amazon + + @property + def linkedin(self) -> "SyncLinkedInSearchScraper": + """LinkedIn search service.""" + if self._linkedin is None: + self._linkedin = SyncLinkedInSearchScraper(self._async.linkedin, self._loop) + return self._linkedin + + @property + def instagram(self) -> "SyncInstagramSearchScraper": + """Instagram search service.""" + if self._instagram is None: + self._instagram = SyncInstagramSearchScraper( + self._async.instagram, self._loop + ) + return self._instagram + + @property + def chatGPT(self) -> "SyncChatGPTSearchService": + """ChatGPT search service.""" + return SyncChatGPTSearchService(self._async.chatGPT, self._loop) + + +class SyncAmazonSearchScraper: + """Sync wrapper for AmazonSearchScraper.""" + + def __init__(self, async_scraper, loop): + self._async = async_scraper + self._loop = loop + + def products(self, keyword=None, 
**kwargs): + return self._loop.run_until_complete(self._async.products(keyword=keyword, **kwargs)) + + +class SyncLinkedInSearchScraper: + """Sync wrapper for LinkedInSearchScraper.""" + + def __init__(self, async_scraper, loop): + self._async = async_scraper + self._loop = loop + + def posts(self, profile_url, **kwargs): + return self._loop.run_until_complete( + self._async.posts(profile_url, **kwargs) + ) + + def profiles(self, **kwargs): + return self._loop.run_until_complete(self._async.profiles(**kwargs)) + + def jobs(self, **kwargs): + return self._loop.run_until_complete(self._async.jobs(**kwargs)) + + +class SyncInstagramSearchScraper: + """Sync wrapper for InstagramSearchScraper.""" + + def __init__(self, async_scraper, loop): + self._async = async_scraper + self._loop = loop + + def posts(self, url, **kwargs): + return self._loop.run_until_complete(self._async.posts(url, **kwargs)) + + def reels(self, url, **kwargs): + return self._loop.run_until_complete(self._async.reels(url, **kwargs)) + + +class SyncChatGPTSearchService: + """Sync wrapper for ChatGPTSearchService.""" + + def __init__(self, async_service, loop): + self._async = async_service + self._loop = loop + + def chatGPT(self, prompt, **kwargs): + """Send prompt(s) to ChatGPT via search service.""" + return self._loop.run_until_complete(self._async.chatGPT(prompt, **kwargs)) + + +# ============================================================================ +# SYNC CRAWLER SERVICE +# ============================================================================ + + +class SyncCrawlerService: + """Sync wrapper for CrawlerService.""" + + def __init__(self, async_service, loop): + self._async = async_service + self._loop = loop + + def crawl(self, url, **kwargs): + """Crawl a URL.""" + return self._loop.run_until_complete(self._async.crawl(url, **kwargs)) + + def scrape(self, url, **kwargs): + """Scrape a URL.""" + return self._loop.run_until_complete(self._async.scrape(url, **kwargs)) diff --git a/tests/e2e/test_client_e2e.py b/tests/e2e/test_client_e2e.py index 9b96d2d..723b16e 100644 --- a/tests/e2e/test_client_e2e.py +++ b/tests/e2e/test_client_e2e.py @@ -14,7 +14,7 @@ except ImportError: pass -from brightdata import BrightDataClient +from brightdata import BrightDataClient, SyncBrightDataClient @pytest.fixture @@ -62,7 +62,6 @@ def test_scrape_service_has_specialized_scrapers(self, api_token): scrape = client.scrape # All scrapers should now be accessible - assert scrape.generic is not None assert scrape.amazon is not None assert scrape.linkedin is not None assert scrape.chatgpt is not None @@ -101,26 +100,25 @@ def test_crawler_service_has_crawl_methods(self, api_token): assert callable(crawler.sitemap) -class TestGenericScraperAccess: - """Test generic scraper through hierarchical access.""" +class TestWebUnlocker: + """Test Web Unlocker scraping via scrape_url().""" @pytest.mark.asyncio - async def test_generic_scraper_async(self, client): - """Test generic scraper through client.scrape.generic.url_async().""" - result = await client.scrape.generic.url_async(url="https://httpbin.org/html") + async def test_scrape_url_async(self, client): + """Test scrape_url() async.""" + result = await client.scrape_url(url="https://httpbin.org/html") assert result is not None assert hasattr(result, "success") assert hasattr(result, "data") - def test_generic_scraper_sync(self, api_token): - """Test generic scraper synchronously.""" - client = BrightDataClient(token=api_token) - - result = 
client.scrape.generic.url(url="https://httpbin.org/html") + def test_scrape_url_sync(self, api_token): + """Test scrape_url() synchronously using SyncBrightDataClient.""" + with SyncBrightDataClient(token=api_token) as client: + result = client.scrape_url(url="https://httpbin.org/html") - assert result is not None - assert result.success or result.error is not None + assert result is not None + assert result.success or result.error is not None class TestConnectionVerification: @@ -204,7 +202,7 @@ def test_hierarchical_access_is_intuitive(self, api_token): assert hasattr(chatgpt_scraper, "prompt") print("\nโœ… Hierarchical access pattern is intuitive!") - print(" - client.scrape.generic.url() โœ… (working)") + print(" - client.scrape_url() โœ… (working)") print(" - client.scrape.amazon.products() โœ… (working)") print(" - client.scrape.linkedin.jobs() โœ… (working)") print(" - client.scrape.chatgpt.prompt() โœ… (working)") @@ -310,7 +308,7 @@ def demo_client_usage(): print("โœ… Services available: scrape, search, crawler") print() print("Example usage:") - print(" result = client.scrape.generic.url('https://example.com')") + print(" result = client.scrape_url('https://example.com')") print(" results = client.search.google('python scraping')") print(" pages = client.crawler.discover('https://example.com')") except Exception as e: diff --git a/tests/enes/amazon.py b/tests/enes/amazon.py index 76b141c..e9fe520 100644 --- a/tests/enes/amazon.py +++ b/tests/enes/amazon.py @@ -30,7 +30,7 @@ async def test_amazon_products(): print("๐Ÿ“ Product URL: https://www.amazon.com/dp/B0CRMZHDG8") try: - result = await scraper.products_async( + result = await scraper.products( url="https://www.amazon.com/dp/B0CRMZHDG8", timeout=240 ) @@ -82,7 +82,7 @@ async def test_amazon_reviews(): print("๐Ÿ“‹ Parameters: pastDays=30, numOfReviews=10") try: - result = await scraper.reviews_async( + result = await scraper.reviews( url="https://www.amazon.com/dp/B0CRMZHDG8", pastDays=30, numOfReviews=10, diff --git a/tests/enes/amazon_search.py b/tests/enes/amazon_search.py index ef6f44f..e4a1831 100644 --- a/tests/enes/amazon_search.py +++ b/tests/enes/amazon_search.py @@ -6,6 +6,9 @@ - client.search.amazon.products(keyword="laptop", min_price=..., etc.) This is DIFFERENT from the old URL-based approach which gets blocked. 
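+
+For example (mirroring the calls exercised below; the filter names shown are just
+the ones this test passes, not an exhaustive list):
+
+    async with client.engine:
+        result = await client.search.amazon.products(
+            keyword="headphones", min_price=5000, max_price=20000
+        )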
+ +python -m tests.enes.amazon_search +python tests/enes/amazon_search.py """ import sys @@ -43,7 +46,7 @@ async def test_new_amazon_search_api(): try: async with client.engine: - result = await client.search.amazon.products_async(keyword="laptop") + result = await client.search.amazon.products(keyword="laptop") print(" โœ… API call succeeded") print(f" Success: {result.success}") @@ -80,7 +83,7 @@ async def test_new_amazon_search_api(): try: async with client.engine: - result = await client.search.amazon.products_async( + result = await client.search.amazon.products( keyword="headphones", min_price=5000, max_price=20000 ) @@ -115,7 +118,7 @@ async def test_new_amazon_search_api(): try: async with client.engine: - result = await client.search.amazon.products_async( + result = await client.search.amazon.products( keyword="phone charger", prime_eligible=True ) diff --git a/tests/enes/chatgpt.py b/tests/enes/chatgpt.py index 3088863..dc66153 100644 --- a/tests/enes/chatgpt.py +++ b/tests/enes/chatgpt.py @@ -29,7 +29,7 @@ async def test_chatgpt_single_prompt(): print("๐Ÿ“‹ Prompt: 'Explain async programming in Python in 2 sentences'") try: - result = await scraper.prompt_async( + result = await scraper.prompt( prompt="Explain async programming in Python in 2 sentences", web_search=False, poll_timeout=180, @@ -83,7 +83,7 @@ async def test_chatgpt_web_search(): print("๐ŸŒ Web search: Enabled") try: - result = await scraper.prompt_async( + result = await scraper.prompt( prompt="What are the latest developments in AI in 2025?", web_search=True, poll_timeout=180, @@ -140,7 +140,7 @@ async def test_chatgpt_multiple_prompts(): print("๐Ÿ“‹ Prompts: ['What is Python?', 'What is JavaScript?']") try: - result = await scraper.prompts_async( + result = await scraper.prompts( prompts=[ "What is Python in one sentence?", "What is JavaScript in one sentence?", diff --git a/tests/enes/chatgpt_02.py b/tests/enes/chatgpt_02.py index cabf2db..476b8c3 100644 --- a/tests/enes/chatgpt_02.py +++ b/tests/enes/chatgpt_02.py @@ -39,7 +39,7 @@ async def test_chatgpt(): print(" Country: US (default)") scraper = client.scrape.chatgpt - result = await scraper.prompt_async(prompt=prompt, web_search=False, poll_timeout=60) + result = await scraper.prompt(prompt=prompt, web_search=False, poll_timeout=60) if result.success: print(" โœ… Prompt successful!") @@ -72,7 +72,7 @@ async def test_chatgpt(): print(" Web search: True") print(" Country: US") - result = await scraper.prompt_async( + result = await scraper.prompt( prompt=prompt, country="us", web_search=True, poll_timeout=90 ) @@ -97,7 +97,7 @@ async def test_chatgpt(): print(f" Prompts: {prompts}") print(" Countries: ['us', 'us']") - result = await scraper.prompts_async( + result = await scraper.prompts( prompts=prompts, countries=["us", "us"], web_searches=[False, False], @@ -128,7 +128,7 @@ async def test_chatgpt(): print(f" Initial prompt: '{prompt}'") print(f" Follow-up: '{follow_up}'") - result = await scraper.prompt_async( + result = await scraper.prompt( prompt=prompt, additional_prompt=follow_up, web_search=False, poll_timeout=90 ) @@ -186,17 +186,17 @@ async def test_chatgpt(): print(f" Prompt: '{prompt}'") # Trigger only - job = await scraper.prompt_trigger_async(prompt=prompt) + job = await scraper.prompt_trigger(prompt=prompt) print(f" โœ… Triggered job: {job.snapshot_id}") # Check status - status = await scraper.prompt_status_async(job.snapshot_id) + status = await scraper.prompt_status(job.snapshot_id) print(f" Initial status: {status}") # Poll until ready 
max_attempts = 30 for attempt in range(max_attempts): - status = await scraper.prompt_status_async(job.snapshot_id) + status = await scraper.prompt_status(job.snapshot_id) if status == "ready": print(f" Status ready after {attempt + 1} checks") break @@ -207,7 +207,7 @@ async def test_chatgpt(): # Fetch results if status == "ready": - data = await scraper.prompt_fetch_async(job.snapshot_id) + data = await scraper.prompt_fetch(job.snapshot_id) print(" โœ… Fetched data successfully") if data and len(data) > 0: print(f" - Answer: {data[0].get('answer_text', 'N/A')[:100]}...") diff --git a/tests/enes/facebook.py b/tests/enes/facebook.py index 3e0a89e..21a2578 100644 --- a/tests/enes/facebook.py +++ b/tests/enes/facebook.py @@ -31,7 +31,7 @@ async def test_facebook_posts_by_profile(): print("๐Ÿ“‹ Parameters: num_of_posts=5") try: - result = await scraper.posts_by_profile_async( + result = await scraper.posts_by_profile( url="https://www.facebook.com/facebook", num_of_posts=5, timeout=240 ) @@ -88,7 +88,7 @@ async def test_facebook_posts_by_group(): print("๐Ÿ“‹ Parameters: num_of_posts=5") try: - result = await scraper.posts_by_group_async( + result = await scraper.posts_by_group( url="https://www.facebook.com/groups/example", num_of_posts=5, timeout=240 ) @@ -141,7 +141,7 @@ async def test_facebook_posts_by_url(): print("๐Ÿ“ Post URL: https://www.facebook.com/facebook/posts/123456789") try: - result = await scraper.posts_by_url_async( + result = await scraper.posts_by_url( url="https://www.facebook.com/facebook/posts/123456789", timeout=240 ) @@ -193,7 +193,7 @@ async def test_facebook_comments(): print("๐Ÿ“‹ Parameters: num_of_comments=10") try: - result = await scraper.comments_async( + result = await scraper.comments( url="https://www.facebook.com/facebook/posts/123456789", num_of_comments=10, timeout=240, @@ -250,7 +250,7 @@ async def test_facebook_reels(): print("๐Ÿ“‹ Parameters: num_of_posts=5") try: - result = await scraper.reels_async( + result = await scraper.reels( url="https://www.facebook.com/facebook", num_of_posts=5, timeout=240 ) diff --git a/tests/enes/instagram.py b/tests/enes/instagram.py index d79286b..91bb749 100644 --- a/tests/enes/instagram.py +++ b/tests/enes/instagram.py @@ -30,7 +30,7 @@ async def test_instagram_profiles(): print("๐Ÿ“ Profile URL: https://www.instagram.com/instagram") try: - result = await scraper.profiles_async( + result = await scraper.profiles( url="https://www.instagram.com/instagram", timeout=180 ) @@ -78,7 +78,7 @@ async def test_instagram_posts(): print("๐Ÿ“ Post URL: https://www.instagram.com/p/C9z9z9z9z9z") try: - result = await scraper.posts_async( + result = await scraper.posts( url="https://www.instagram.com/p/C9z9z9z9z9z", timeout=180 ) @@ -124,7 +124,7 @@ async def test_instagram_reels(): print("๐Ÿ“ Reel URL: https://www.instagram.com/reel/ABC123") try: - result = await scraper.reels_async( + result = await scraper.reels( url="https://www.instagram.com/reel/ABC123", timeout=180 ) @@ -170,7 +170,7 @@ async def test_instagram_search_posts(): print("๐Ÿ“‹ Search: profile url, num_of_posts=10") try: - result = await scraper.posts_async( + result = await scraper.posts( url="https://www.instagram.com/instagram", num_of_posts=10, timeout=180 ) diff --git a/tests/enes/linkedin.py b/tests/enes/linkedin.py index 5863287..908e601 100644 --- a/tests/enes/linkedin.py +++ b/tests/enes/linkedin.py @@ -30,7 +30,7 @@ async def test_linkedin_profiles(): print("๐Ÿ“ Profile URL: https://www.linkedin.com/in/williamhgates") try: - result = await 
scraper.profiles_async( + result = await scraper.profiles( url="https://www.linkedin.com/in/williamhgates", timeout=180 ) @@ -76,7 +76,7 @@ async def test_linkedin_companies(): print("๐Ÿ“ Company URL: https://www.linkedin.com/company/microsoft") try: - result = await scraper.companies_async( + result = await scraper.companies( url="https://www.linkedin.com/company/microsoft", timeout=180 ) @@ -122,7 +122,7 @@ async def test_linkedin_jobs(): print("๐Ÿ“ Job URL: https://www.linkedin.com/jobs/view/3787241244") try: - result = await scraper.jobs_async( + result = await scraper.jobs( url="https://www.linkedin.com/jobs/view/3787241244", timeout=180 ) @@ -168,7 +168,7 @@ async def test_linkedin_search_jobs(): print("๐Ÿ“‹ Search: keyword='python developer', location='New York'") try: - result = await scraper.jobs_async( + result = await scraper.jobs( keyword="python developer", location="New York", timeout=180 ) diff --git a/tests/enes/serp.py b/tests/enes/serp.py index 8055a82..4226e05 100644 --- a/tests/enes/serp.py +++ b/tests/enes/serp.py @@ -31,7 +31,7 @@ async def test_serp_raw_html_issue(): try: # Make the search request - result = await client.search.google_async(query="pizza") + result = await client.search.google(query="pizza") print("\nโœ… API call succeeded") print(f"โฑ๏ธ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "") @@ -77,7 +77,7 @@ def capture_raw(data): service.data_normalizer.normalize = capture_raw # Make the request - await service.search_async(query="pizza", zone=client.serp_zone) + await service.search(query="pizza", zone=client.serp_zone) if raw_response: print("\n๐Ÿ“ฆ Raw API response structure:") diff --git a/tests/enes/web_unlocker.py b/tests/enes/web_unlocker.py index 1a9ea1e..a9a72c4 100644 --- a/tests/enes/web_unlocker.py +++ b/tests/enes/web_unlocker.py @@ -33,7 +33,7 @@ async def test_web_unlocker_single_url(): print("๐Ÿ“ URL: https://httpbin.org/html") try: - result = await client.scrape.generic.url_async( + result = await client.scrape_url( url="https://httpbin.org/html", response_format="raw" ) @@ -86,7 +86,7 @@ async def test_web_unlocker_json_format(): print("๐Ÿ“ URL: https://httpbin.org/json") try: - result = await client.scrape.generic.url_async( + result = await client.scrape_url( url="https://httpbin.org/json", response_format="json" ) @@ -139,7 +139,7 @@ async def test_web_unlocker_multiple_urls(): print(f"๐Ÿ“‹ URLs: {len(urls)} URLs") try: - results = await client.scrape.generic.url_async(url=urls, response_format="raw") + results = await client.scrape_url(url=urls, response_format="raw") print("\nโœ… API call succeeded") print(f"๐Ÿ“Š Got {len(results)} results") @@ -191,7 +191,7 @@ async def test_web_unlocker_with_country(): print("๐ŸŒ Country: US") try: - result = await client.scrape.generic.url_async( + result = await client.scrape_url( url="https://httpbin.org/headers", country="us", response_format="raw" ) diff --git a/tests/enes/zones/auto_zone.py b/tests/enes/zones/auto_zone.py index 43c6f30..a60d0b9 100644 --- a/tests/enes/zones/auto_zone.py +++ b/tests/enes/zones/auto_zone.py @@ -93,7 +93,7 @@ def test_auto_zone_creation(): async def create_web_unlocker(): async with client: # This should trigger zone creation - result = await client.scrape_url_async( + result = await client.scrape_url( url="https://example.com", zone=client.web_unlocker_zone ) return result @@ -119,7 +119,7 @@ async def create_web_unlocker(): async def create_serp(): async with client: # This should trigger SERP zone creation - result = await 
client.search.google_async(query="test", zone=client.serp_zone) + result = await client.search.google(query="test", zone=client.serp_zone) return result asyncio.run(create_serp()) diff --git a/tests/enes/zones/auto_zones.py b/tests/enes/zones/auto_zones.py index eda43a0..c439793 100644 --- a/tests/enes/zones/auto_zones.py +++ b/tests/enes/zones/auto_zones.py @@ -92,7 +92,7 @@ async def attempt_zone_creations(): print(f"\n1๏ธโƒฃ Attempting to create Web Unlocker zone: {client.web_unlocker_zone}") try: async with client: - await client.scrape_url_async( + await client.scrape_url( url="https://example.com", zone=client.web_unlocker_zone ) print(" โœ… Zone operation completed") @@ -114,7 +114,7 @@ async def attempt_zone_creations(): print(f"\n2๏ธโƒฃ Attempting to create SERP zone: {client.serp_zone}") try: async with client: - await client.search.google_async(query="test", zone=client.serp_zone) + await client.search.google(query="test", zone=client.serp_zone) print(" โœ… Zone operation completed") results.append(("SERP", client.serp_zone, True)) except Exception as e: diff --git a/tests/enes/zones/crud_zones.py b/tests/enes/zones/crud_zones.py index fbcd416..f244f79 100644 --- a/tests/enes/zones/crud_zones.py +++ b/tests/enes/zones/crud_zones.py @@ -68,11 +68,11 @@ async def test_create_zones(self) -> bool: # Trigger zone creation try: if zone_type == "unblocker": - await temp_client.scrape_url_async( + await temp_client.scrape_url( url="https://example.com", zone=zone_name ) else: # serp - await temp_client.search.google_async(query="test", zone=zone_name) + await temp_client.search.google(query="test", zone=zone_name) except Exception: # Zone might be created even if operation fails pass diff --git a/tests/enes/zones/delete_zone.py b/tests/enes/zones/delete_zone.py index f586160..89c7679 100644 --- a/tests/enes/zones/delete_zone.py +++ b/tests/enes/zones/delete_zone.py @@ -61,7 +61,7 @@ async def demo_delete_zone(): async with test_client: # Trigger zone creation try: - await test_client.scrape_url_async( + await test_client.scrape_url( url="https://example.com", zone=test_zone_name ) except Exception as e: diff --git a/tests/enes/zones/test_cache.py b/tests/enes/zones/test_cache.py index fa82ef6..356c401 100644 --- a/tests/enes/zones/test_cache.py +++ b/tests/enes/zones/test_cache.py @@ -52,7 +52,7 @@ async def test_caching_issue(): ) async with temp: try: - await temp.scrape_url_async("https://example.com", zone=test_zone) + await temp.scrape_url("https://example.com", zone=test_zone) except Exception: pass print(f" Zone '{test_zone}' created") diff --git a/tests/integration/test_client_integration.py b/tests/integration/test_client_integration.py index 6f191b4..44dc4a7 100644 --- a/tests/integration/test_client_integration.py +++ b/tests/integration/test_client_integration.py @@ -14,7 +14,7 @@ except ImportError: pass -from brightdata import BrightDataClient +from brightdata import BrightDataClient, SyncBrightDataClient from brightdata.exceptions import AuthenticationError @@ -29,10 +29,17 @@ def api_token(): @pytest.fixture def client(api_token): - """Create client instance for testing.""" + """Create async client instance for testing (must be used with async context).""" return BrightDataClient(token=api_token) +@pytest.fixture +def sync_client(api_token): + """Create sync client instance for testing.""" + with SyncBrightDataClient(token=api_token) as client: + yield client + + @pytest.fixture async def async_client(api_token): """Create async client instance for testing.""" @@ -61,9 
+68,9 @@ async def test_connection_with_invalid_token(self): is_valid = await client.test_connection() assert is_valid is False - def test_connection_sync_with_valid_token(self, client): - """Test synchronous connection test.""" - is_valid = client.test_connection_sync() + def test_connection_sync_with_valid_token(self, sync_client): + """Test synchronous connection test using SyncBrightDataClient.""" + is_valid = sync_client.test_connection() assert is_valid is True @@ -112,9 +119,9 @@ async def test_get_account_info_with_invalid_token(self): assert "Invalid token" in str(exc_info.value) or "401" in str(exc_info.value) - def test_get_account_info_sync(self, client): - """Test synchronous account info retrieval.""" - info = client.get_account_info_sync() + def test_get_account_info_sync(self, sync_client): + """Test synchronous account info retrieval using SyncBrightDataClient.""" + info = sync_client.get_account_info() assert isinstance(info, dict) assert "zones" in info @@ -153,10 +160,15 @@ def test_client_with_validate_token_true_and_valid_token(self, api_token): client = BrightDataClient(token=api_token, validate_token=True) assert client.token == api_token - def test_client_with_validate_token_true_and_invalid_token(self): - """Test client raises error on init if token is invalid and validation enabled.""" + @pytest.mark.asyncio + async def test_client_with_validate_token_true_and_invalid_token(self): + """Test client raises error on __aenter__ if token is invalid and validation enabled.""" + client = BrightDataClient( + token="invalid_token_123456789", validate_token=True, auto_create_zones=False + ) with pytest.raises(AuthenticationError): - BrightDataClient(token="invalid_token_123456789", validate_token=True) + async with client: + pass # Should not reach here def test_client_with_validate_token_false_accepts_any_token(self): """Test client accepts any token format when validation disabled.""" @@ -172,15 +184,15 @@ class TestLegacyAPICompatibility: async def test_scrape_url_async_works(self, async_client): """Test legacy scrape_url_async method works.""" # Simple test URL - result = await async_client.scrape_url_async(url="https://httpbin.org/html") + result = await async_client.scrape_url(url="https://httpbin.org/html") assert result is not None assert hasattr(result, "success") assert hasattr(result, "data") - def test_scrape_url_sync_works(self, client): - """Test legacy scrape_url method works synchronously.""" - result = client.scrape_url(url="https://httpbin.org/html") + def test_scrape_url_sync_works(self, sync_client): + """Test scrape_url method works synchronously using SyncBrightDataClient.""" + result = sync_client.scrape_url(url="https://httpbin.org/html") assert result is not None assert hasattr(result, "success") @@ -201,9 +213,10 @@ async def test_connection_test_returns_false_on_network_error(self): assert is_valid is False def test_sync_connection_test_returns_false_on_error(self): - """Test sync connection test returns False on errors.""" - client = BrightDataClient(token="test_token_123456789") - - # Should return False, not raise exception - is_valid = client.test_connection_sync() - assert is_valid is False + """Test sync connection test returns False on errors using SyncBrightDataClient.""" + with SyncBrightDataClient( + token="test_token_123456789", auto_create_zones=False + ) as client: + # Should return False, not raise exception + is_valid = client.test_connection() + assert is_valid is False diff --git a/tests/readme.py b/tests/readme.py index 
f47ab89..8b0a183 100644 --- a/tests/readme.py +++ b/tests/readme.py @@ -104,13 +104,13 @@ def test_simple_web_scraping(self, client): Line: 101-118 """ # From README: - # result = client.scrape.generic.url("https://example.com") + # result = client.scrape_url("https://example.com") # if result.success: # print(f"Success: {result.success}") # print(f"Data: {result.data[:200]}...") # print(f"Time: {result.elapsed_ms():.2f}ms") - result = client.scrape.generic.url("https://example.com") + result = client.scrape_url("https://example.com") assert result is not None, "Result is None" assert hasattr(result, "success"), "Result missing 'success' attribute" @@ -673,7 +673,7 @@ async def test_async_multiple_urls(self, api_token): # From README: # async def scrape_multiple(): # async with BrightDataClient() as client: - # results = await client.scrape.generic.url_async([ + # results = await client.scrape_url([ # "https://example1.com", # "https://example2.com", # "https://example3.com" @@ -682,7 +682,7 @@ async def test_async_multiple_urls(self, api_token): # print(f"Success: {result.success}") async with BrightDataClient(token=api_token) as client: - results = await client.scrape.generic.url_async( + results = await client.scrape_url( ["https://httpbin.org/html", "https://example.com", "https://httpbin.org/json"] ) @@ -771,7 +771,7 @@ def test_result_object_attributes(self, client): # result.elapsed_ms(), result.get_timing_breakdown() # result.to_dict(), result.to_json(indent=2) - result = client.scrape.generic.url("https://example.com") + result = client.scrape_url("https://example.com") # Verify all attributes assert hasattr(result, "success"), "Missing 'success' attribute" @@ -828,13 +828,13 @@ async def test_async_method_usage(self, api_token): # From README: # async def scrape_profiles(): # async with BrightDataClient() as client: - # result = await client.scrape.linkedin.profiles_async( + # result = await client.scrape.linkedin.profiles( # url="https://linkedin.com/in/johndoe", # timeout=300 # ) async with BrightDataClient(token=api_token) as client: - result = await client.scrape.linkedin.profiles_async( + result = await client.scrape.linkedin.profiles( url="https://linkedin.com/in/williamhgates", timeout=300 )