From 36d4fc9791b389127c6b7d2704d1a1c2f19ff3b9 Mon Sep 17 00:00:00 2001 From: Vitor Zucher Date: Thu, 4 Dec 2025 09:41:31 -0300 Subject: [PATCH 1/4] Fixed Amazon Search Dataset ID and broken links in README --- README.md | 47 ++++-------------------- src/brightdata/scrapers/amazon/search.py | 45 +++++++++++++++-------- 2 files changed, 36 insertions(+), 56 deletions(-) diff --git a/README.md b/README.md index 1dbf8c3..3e05593 100644 --- a/README.md +++ b/README.md @@ -83,11 +83,11 @@ Modern async-first Python SDK for [Bright Data](https://brightdata.com) APIs wit Perfect for data scientists! Interactive tutorials with examples: -1. **[01_quickstart.ipynb](notebooks/01_quickstart.ipynb)** - Get started in 5 minutes [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/brightdata/sdk-python/blob/master/notebooks/01_quickstart.ipynb) -2. **[02_pandas_integration.ipynb](notebooks/02_pandas_integration.ipynb)** - Work with DataFrames [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/brightdata/sdk-python/blob/master/notebooks/02_pandas_integration.ipynb) -3. **[03_amazon_scraping.ipynb](notebooks/03_amazon_scraping.ipynb)** - Amazon deep dive [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/brightdata/sdk-python/blob/master/notebooks/03_amazon_scraping.ipynb) -4. **[04_linkedin_jobs.ipynb](notebooks/04_linkedin_jobs.ipynb)** - Job market analysis [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/brightdata/sdk-python/blob/master/notebooks/04_linkedin_jobs.ipynb) -5. **[05_batch_processing.ipynb](notebooks/05_batch_processing.ipynb)** - Scale to 1000s of URLs [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/brightdata/sdk-python/blob/master/notebooks/05_batch_processing.ipynb) +1. **[01_quickstart.ipynb](notebooks/01_quickstart.ipynb)** - Get started in 5 minutes [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/brightdata/sdk-python/blob/main/notebooks/01_quickstart.ipynb) +2. **[02_pandas_integration.ipynb](notebooks/02_pandas_integration.ipynb)** - Work with DataFrames [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/brightdata/sdk-python/blob/main/notebooks/02_pandas_integration.ipynb) +3. **[03_amazon_scraping.ipynb](notebooks/03_amazon_scraping.ipynb)** - Amazon deep dive [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/brightdata/sdk-python/blob/main/notebooks/03_amazon_scraping.ipynb) +4. **[04_linkedin_jobs.ipynb](notebooks/04_linkedin_jobs.ipynb)** - Job market analysis [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/brightdata/sdk-python/blob/main/notebooks/04_linkedin_jobs.ipynb) +5. 
**[05_batch_processing.ipynb](notebooks/05_batch_processing.ipynb)** - Scale to 1000s of URLs [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/brightdata/sdk-python/blob/main/notebooks/05_batch_processing.ipynb) --- @@ -1078,10 +1078,8 @@ pytest tests/ --cov=brightdata --cov-report=html - [All examples →](examples/) ### Documentation -- [Quick Start Guide](docs/quickstart.md) -- [Architecture Overview](docs/architecture.md) - [API Reference](docs/api-reference/) -- [Contributing Guide](docs/contributing.md) +- [Contributing Guidelines](https://github.com/brightdata/sdk-python/blob/main/CONTRIBUTING.md) (See upstream repo) --- @@ -1140,7 +1138,7 @@ pip install -e . ## šŸ¤ Contributing -Contributions are welcome! Please see [CONTRIBUTING.md](docs/contributing.md) for guidelines. +Contributions are welcome! Check the [GitHub repository](https://github.com/brightdata/sdk-python) for contribution guidelines. ### Development Setup @@ -1269,37 +1267,6 @@ Run the included demo to explore the SDK interactively: ```bash python demo_sdk.py ``` - ---- - -## šŸŽÆ Roadmap - -### āœ… Completed -- [x] Core client with authentication -- [x] Web Unlocker service -- [x] Platform scrapers (Amazon, LinkedIn, ChatGPT, Facebook, Instagram) -- [x] SERP API (Google, Bing, Yandex) -- [x] Comprehensive test suite (502+ tests) -- [x] .env file support via python-dotenv -- [x] SSL error handling with helpful guidance -- [x] Centralized constants module -- [x] Function-level monitoring -- [x] **Dataclass payloads with validation** -- [x] **Jupyter notebooks for data scientists** -- [x] **CLI tool (brightdata command)** -- [x] **Pandas integration examples** -- [x] **Single shared AsyncEngine (8x efficiency)** - -### 🚧 In Progress -- [ ] Browser automation API -- [ ] Web crawler API - -### šŸ”® Future -- [ ] Additional platforms (Reddit, Twitter/X, TikTok, YouTube) -- [ ] Real-time data streaming -- [ ] Advanced caching strategies -- [ ] Prometheus metrics export - --- ## šŸ™ Acknowledgments diff --git a/src/brightdata/scrapers/amazon/search.py b/src/brightdata/scrapers/amazon/search.py index b2154e8..5af35ba 100644 --- a/src/brightdata/scrapers/amazon/search.py +++ b/src/brightdata/scrapers/amazon/search.py @@ -35,7 +35,7 @@ class AmazonSearchScraper: """ # Amazon dataset IDs - DATASET_ID_PRODUCTS_SEARCH = "gd_l7q7dkf244hwjntr0" # Amazon Products with search + DATASET_ID_PRODUCTS_SEARCH = "gd_lwdb4vjm1ehb499uxs" # Amazon Products Search (15.84M records) def __init__(self, bearer_token: str, engine: Optional[AsyncEngine] = None): """ @@ -125,26 +125,39 @@ async def products_async( conditions = self._normalize_param(condition, batch_size) countries = self._normalize_param(country, batch_size) - # Build payload - Amazon API requires URLs - # If keyword provided, build Amazon search URL internally + # Build payload - Amazon Products Search dataset expects keyword field payload = [] for i in range(batch_size): + item = {} + # If URL provided directly, use it if urls and i < len(urls): - item = {"url": urls[i]} + item["url"] = urls[i] + # Extract keyword from URL if possible for the keyword field + if "k=" in urls[i]: + import urllib.parse + parsed = urllib.parse.urlparse(urls[i]) + params = urllib.parse.parse_qs(parsed.query) + item["keyword"] = params.get("k", [""])[0] + else: + item["keyword"] = "" else: - # Build Amazon search URL from parameters - search_url = self._build_amazon_search_url( - keyword=keywords[i] if keywords and i < len(keywords) else 
None, - category=categories[i] if categories and i < len(categories) else None, - min_price=min_prices[i] if min_prices and i < len(min_prices) else None, - max_price=max_prices[i] if max_prices and i < len(max_prices) else None, - condition=conditions[i] if conditions and i < len(conditions) else None, - prime_eligible=prime_eligible, - country=countries[i] if countries and i < len(countries) else None, - ) - item = {"url": search_url} - + # Send keyword directly (dataset expects this field) + item["keyword"] = keywords[i] if keywords and i < len(keywords) else "" + + # Optionally build URL for additional context + if item["keyword"]: + search_url = self._build_amazon_search_url( + keyword=item["keyword"], + category=categories[i] if categories and i < len(categories) else None, + min_price=min_prices[i] if min_prices and i < len(min_prices) else None, + max_price=max_prices[i] if max_prices and i < len(max_prices) else None, + condition=conditions[i] if conditions and i < len(conditions) else None, + prime_eligible=prime_eligible, + country=countries[i] if countries and i < len(countries) else None, + ) + item["url"] = search_url + payload.append(item) return await self._execute_search( From 97a2ffa3bb04fb6fac3949940f9c8c67896862c7 Mon Sep 17 00:00:00 2001 From: Vitor Zucher Date: Thu, 4 Dec 2025 09:46:30 -0300 Subject: [PATCH 2/4] style: Format amazon/search.py with black (fix CI lint) --- src/brightdata/scrapers/amazon/search.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/brightdata/scrapers/amazon/search.py b/src/brightdata/scrapers/amazon/search.py index 5af35ba..802b7d9 100644 --- a/src/brightdata/scrapers/amazon/search.py +++ b/src/brightdata/scrapers/amazon/search.py @@ -129,13 +129,14 @@ async def products_async( payload = [] for i in range(batch_size): item = {} - + # If URL provided directly, use it if urls and i < len(urls): item["url"] = urls[i] # Extract keyword from URL if possible for the keyword field if "k=" in urls[i]: import urllib.parse + parsed = urllib.parse.urlparse(urls[i]) params = urllib.parse.parse_qs(parsed.query) item["keyword"] = params.get("k", [""])[0] @@ -144,7 +145,7 @@ async def products_async( else: # Send keyword directly (dataset expects this field) item["keyword"] = keywords[i] if keywords and i < len(keywords) else "" - + # Optionally build URL for additional context if item["keyword"]: search_url = self._build_amazon_search_url( @@ -157,7 +158,7 @@ async def products_async( country=countries[i] if countries and i < len(countries) else None, ) item["url"] = search_url - + payload.append(item) return await self._execute_search( From ea894a8e8b4c90e6287d9adbaf09f6e5d2e225b3 Mon Sep 17 00:00:00 2001 From: Vitor Zucher Date: Thu, 4 Dec 2025 10:36:37 -0300 Subject: [PATCH 3/4] Improved sync/async documentation and added _run for SERP --- README.md | 209 ++++++++++++++++++++++++--- src/brightdata/api/search_service.py | 15 +- 2 files changed, 202 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 3e05593..9d08843 100644 --- a/README.md +++ b/README.md @@ -149,9 +149,9 @@ client = BrightDataClient() result = client.scrape.generic.url("https://example.com") if result.success: - print(f"Success: {result.success}") - print(f"Data: {result.data[:200]}...") - print(f"Time: {result.elapsed_ms():.2f}ms") +print(f"Success: {result.success}") +print(f"Data: {result.data[:200]}...") +print(f"Time: {result.elapsed_ms():.2f}ms") else: print(f"Error: {result.error}") ``` @@ -460,13 +460,14 @@ 
asyncio.run(scrape_multiple()) ## šŸ†• What's New in v2 2.0.0 ### šŸ†• **Latest Updates (December 2025)** -- āœ… **Amazon Search API** - NEW parameter-based product discovery +- āœ… **Amazon Search API** - NEW parameter-based product discovery with correct dataset - āœ… **LinkedIn Job Search Fixed** - Now builds URLs from keywords internally - āœ… **Trigger Interface** - Manual trigger/poll/fetch control for all platforms +- āœ… **29 Sync Wrapper Fixes** - All sync methods work (scrapers + SERP API) +- āœ… **Batch Operations Fixed** - Returns List[ScrapeResult] correctly - āœ… **Auto-Create Zones** - Now enabled by default (was opt-in) - āœ… **Improved Zone Names** - `sdk_unlocker`, `sdk_serp`, `sdk_browser` -- āœ… **26 Sync Wrapper Fixes** - All platform scrapers now work without context managers -- āœ… **Zone Manager Tests Fixed** - All 22 tests passing +- āœ… **Full Sync/Async Examples** - README now shows both patterns for all features ### šŸŽ“ **For Data Scientists** - āœ… **5 Jupyter Notebooks** - Complete interactive tutorials @@ -924,29 +925,199 @@ result = client.search.linkedin.jobs( ) ``` -### Sync vs Async Methods +### Sync vs Async Examples - Full Coverage + +All SDK methods support **both sync and async** patterns. Choose based on your needs: + +#### **Amazon Products** ```python -# Sync wrapper - for simple scripts (blocks until complete) -result = client.scrape.linkedin.profiles( - url="https://linkedin.com/in/johndoe", - timeout=300 # Max wait time in seconds -) +# SYNC - Simple scripts +result = client.scrape.amazon.products(url="https://amazon.com/dp/B123") -# Async method - for concurrent operations (requires async context) +# ASYNC - Concurrent operations import asyncio -async def scrape_profiles(): +async def scrape_amazon(): + async with BrightDataClient() as client: + result = await client.scrape.amazon.products_async(url="https://amazon.com/dp/B123") + return result + +result = asyncio.run(scrape_amazon()) +``` + +#### **Amazon Search** + +```python +# SYNC - Simple keyword search +result = client.search.amazon.products(keyword="laptop", prime_eligible=True) + +# ASYNC - Batch keyword searches +async def search_amazon(): + async with BrightDataClient() as client: + result = await client.search.amazon.products_async( + keyword="laptop", + min_price=50000, + max_price=200000, + prime_eligible=True + ) + return result + +result = asyncio.run(search_amazon()) +``` + +#### **LinkedIn Scraping** + +```python +# SYNC - Single profile +result = client.scrape.linkedin.profiles(url="https://linkedin.com/in/johndoe") + +# ASYNC - Multiple profiles concurrently +async def scrape_linkedin(): async with BrightDataClient() as client: - result = await client.scrape.linkedin.profiles_async( - url="https://linkedin.com/in/johndoe", - timeout=300 + urls = ["https://linkedin.com/in/person1", "https://linkedin.com/in/person2"] + results = await client.scrape.linkedin.profiles_async(url=urls) + return results + +results = asyncio.run(scrape_linkedin()) +``` + +#### **LinkedIn Job Search** + +```python +# SYNC - Simple job search +result = client.search.linkedin.jobs(keyword="python", location="NYC", remote=True) + +# ASYNC - Advanced search with filters +async def search_jobs(): + async with BrightDataClient() as client: + result = await client.search.linkedin.jobs_async( + keyword="python developer", + location="New York", + experienceLevel="mid", + jobType="full-time", + remote=True ) return result -result = asyncio.run(scrape_profiles()) +result = asyncio.run(search_jobs()) ``` +#### 
**SERP API (Google, Bing, Yandex)** + +```python +# SYNC - Quick Google search +result = client.search.google(query="python tutorial", location="United States") + +# ASYNC - Multiple search engines concurrently +async def search_all_engines(): + async with BrightDataClient() as client: + google = await client.search.google_async(query="python", num_results=10) + bing = await client.search.bing_async(query="python", num_results=10) + yandex = await client.search.yandex_async(query="python", num_results=10) + return google, bing, yandex + +results = asyncio.run(search_all_engines()) +``` + +#### **Facebook Scraping** + +```python +# SYNC - Single profile posts +result = client.scrape.facebook.posts_by_profile( + url="https://facebook.com/profile", + num_of_posts=10 +) + +# ASYNC - Multiple sources +async def scrape_facebook(): + async with BrightDataClient() as client: + profile_posts = await client.scrape.facebook.posts_by_profile_async( + url="https://facebook.com/zuck", + num_of_posts=10 + ) + group_posts = await client.scrape.facebook.posts_by_group_async( + url="https://facebook.com/groups/programming", + num_of_posts=10 + ) + return profile_posts, group_posts + +results = asyncio.run(scrape_facebook()) +``` + +#### **Instagram Scraping** + +```python +# SYNC - Single profile +result = client.scrape.instagram.profiles(url="https://instagram.com/instagram") + +# ASYNC - Profile + posts +async def scrape_instagram(): + async with BrightDataClient() as client: + profile = await client.scrape.instagram.profiles_async( + url="https://instagram.com/instagram" + ) + posts = await client.scrape.instagram.posts_async( + url="https://instagram.com/p/ABC123" + ) + return profile, posts + +results = asyncio.run(scrape_instagram()) +``` + +#### **ChatGPT** + +```python +# SYNC - Single prompt +result = client.scrape.chatgpt.prompt(prompt="Explain Python", web_search=True) + +# ASYNC - Batch prompts +async def ask_chatgpt(): + async with BrightDataClient() as client: + result = await client.scrape.chatgpt.prompts_async( + prompts=["What is Python?", "What is JavaScript?"], + web_searches=[False, True] + ) + return result + +result = asyncio.run(ask_chatgpt()) +``` + +#### **Generic Web Scraping** + +```python +# SYNC - Single URL +result = client.scrape.generic.url(url="https://example.com") + +# ASYNC - Concurrent scraping +async def scrape_multiple(): + async with BrightDataClient() as client: + results = await client.scrape.generic.url_async([ + "https://example1.com", + "https://example2.com", + "https://example3.com" + ]) + return results + +results = asyncio.run(scrape_multiple()) +``` + +--- + +### **When to Use Sync vs Async** + +**Use Sync When:** +- āœ… Simple scripts or notebooks +- āœ… Single operations at a time +- āœ… Learning or prototyping +- āœ… Sequential workflows + +**Use Async When:** +- āœ… Scraping multiple URLs concurrently +- āœ… Combining multiple API calls +- āœ… Production applications +- āœ… Performance-critical operations + **Note:** Sync wrappers (e.g., `profiles()`) internally use `asyncio.run()` and cannot be called from within an existing async context. Use `*_async` methods when you're already in an async function. 
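+
+For example, a minimal sketch of the difference (reusing the LinkedIn profile example above; assumes `client` and `BrightDataClient` are set up as in the quickstart, and the exact failure message depends on the SDK internals):
+
+```python
+async def broken():
+    # Sync wrapper inside a running event loop: it calls asyncio.run() internally,
+    # which cannot nest, so this is expected to raise RuntimeError.
+    return client.scrape.linkedin.profiles(url="https://linkedin.com/in/johndoe")
+
+async def fixed():
+    # Already inside an event loop, so await the *_async variant instead.
+    async with BrightDataClient() as client:
+        return await client.scrape.linkedin.profiles_async(url="https://linkedin.com/in/johndoe")
+```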
### SSL Certificate Error Handling @@ -1236,7 +1407,7 @@ if client.test_connection_sync(): ) if fb_posts.success: - print(f"Scraped {len(fb_posts.data)} Facebook posts") + print(f"Scraped {len(fb_posts.data)} Facebook posts") # Scrape Instagram profile ig_profile = client.scrape.instagram.profiles( diff --git a/src/brightdata/api/search_service.py b/src/brightdata/api/search_service.py index e3e54ae..0bbb930 100644 --- a/src/brightdata/api/search_service.py +++ b/src/brightdata/api/search_service.py @@ -116,7 +116,10 @@ def google( ... location="United States" ... ) """ - return asyncio.run(self.google_async(query, **kwargs)) + async def _run(): + async with self._client.engine: + return await self.google_async(query, **kwargs) + return asyncio.run(_run()) async def bing_async( self, @@ -148,7 +151,10 @@ async def bing_async( def bing(self, query: Union[str, List[str]], **kwargs): """Search Bing synchronously.""" - return asyncio.run(self.bing_async(query, **kwargs)) + async def _run(): + async with self._client.engine: + return await self.bing_async(query, **kwargs) + return asyncio.run(_run()) async def yandex_async( self, @@ -180,7 +186,10 @@ async def yandex_async( def yandex(self, query: Union[str, List[str]], **kwargs): """Search Yandex synchronously.""" - return asyncio.run(self.yandex_async(query, **kwargs)) + async def _run(): + async with self._client.engine: + return await self.yandex_async(query, **kwargs) + return asyncio.run(_run()) @property def amazon(self): From 72fa443192fb03c5ce3c57769810032ee7f2d2a7 Mon Sep 17 00:00:00 2001 From: Vitor Zucher Date: Thu, 4 Dec 2025 11:16:33 -0300 Subject: [PATCH 4/4] fix: Complete sync wrapper coverage and Amazon Search dataset fix --- .gitignore | 2 +- src/brightdata/api/base.py | 8 +- src/brightdata/api/scrape_service.py | 7 +- src/brightdata/api/search_service.py | 6 + tests/run_all.py | 185 +++++++++++++++++++++++++++ tests/test_cli.sh | 175 +++++++++++++++++++++++++ 6 files changed, 380 insertions(+), 3 deletions(-) create mode 100644 tests/run_all.py create mode 100755 tests/test_cli.sh diff --git a/.gitignore b/.gitignore index b990093..8dc3c14 100644 --- a/.gitignore +++ b/.gitignore @@ -261,4 +261,4 @@ Thumbs.db # Project specific *.log .cache/ - +probe diff --git a/src/brightdata/api/base.py b/src/brightdata/api/base.py index 6bd4251..f99fa15 100644 --- a/src/brightdata/api/base.py +++ b/src/brightdata/api/base.py @@ -38,6 +38,7 @@ def _execute_sync(self, *args: Any, **kwargs: Any) -> Any: Execute API operation synchronously. Wraps async method using asyncio.run() for sync compatibility. + Properly manages engine context. """ try: asyncio.get_running_loop() @@ -45,4 +46,9 @@ def _execute_sync(self, *args: Any, **kwargs: Any) -> Any: "Cannot call sync method from async context. Use async method instead." 
) except RuntimeError: - return asyncio.run(self._execute_async(*args, **kwargs)) + + async def _run(): + async with self.engine: + return await self._execute_async(*args, **kwargs) + + return asyncio.run(_run()) diff --git a/src/brightdata/api/scrape_service.py b/src/brightdata/api/scrape_service.py index 30aacc4..7b367be 100644 --- a/src/brightdata/api/scrape_service.py +++ b/src/brightdata/api/scrape_service.py @@ -214,4 +214,9 @@ async def url_async( def url(self, *args, **kwargs) -> Union[ScrapeResult, List[ScrapeResult]]: """Scrape URL(s) synchronously.""" - return asyncio.run(self.url_async(*args, **kwargs)) + + async def _run(): + async with self._client.engine: + return await self.url_async(*args, **kwargs) + + return asyncio.run(_run()) diff --git a/src/brightdata/api/search_service.py b/src/brightdata/api/search_service.py index 0bbb930..d9d15ae 100644 --- a/src/brightdata/api/search_service.py +++ b/src/brightdata/api/search_service.py @@ -116,9 +116,11 @@ def google( ... location="United States" ... ) """ + async def _run(): async with self._client.engine: return await self.google_async(query, **kwargs) + return asyncio.run(_run()) async def bing_async( @@ -151,9 +153,11 @@ async def bing_async( def bing(self, query: Union[str, List[str]], **kwargs): """Search Bing synchronously.""" + async def _run(): async with self._client.engine: return await self.bing_async(query, **kwargs) + return asyncio.run(_run()) async def yandex_async( @@ -186,9 +190,11 @@ async def yandex_async( def yandex(self, query: Union[str, List[str]], **kwargs): """Search Yandex synchronously.""" + async def _run(): async with self._client.engine: return await self.yandex_async(query, **kwargs) + return asyncio.run(_run()) @property diff --git a/tests/run_all.py b/tests/run_all.py new file mode 100644 index 0000000..789f7e9 --- /dev/null +++ b/tests/run_all.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python3 +""" +Comprehensive test runner - validates EVERYTHING +Saves all outputs to probe/ directory for inspection +""" + +import subprocess +import json +from pathlib import Path +from datetime import datetime + +# Create probe directory structure matching tests/ structure +PROBE_DIR = Path("probe") +PROBE_DIR.mkdir(exist_ok=True) +(PROBE_DIR / "unit").mkdir(exist_ok=True) +(PROBE_DIR / "e2e").mkdir(exist_ok=True) +(PROBE_DIR / "integration").mkdir(exist_ok=True) +(PROBE_DIR / "enes").mkdir(exist_ok=True) +(PROBE_DIR / "root").mkdir(exist_ok=True) + +# Test suites to run (matches tests/ directory structure) +test_suites = { + "root_readme": "tests/readme.py", # Root level test + "unit": "tests/unit/", + "e2e": "tests/e2e/", + "integration": "tests/integration/", + "enes": "tests/enes/", +} + +# Linting checks +lint_checks = { + "black": ["black", "--check", "src", "tests"], + "ruff": ["ruff", "check", "src/", "tests/"], +} + +timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") +results = {"timestamp": timestamp, "test_suites": {}, "lint_checks": {}, "summary": {}} + +print("=" * 80) +print("COMPREHENSIVE SDK VALIDATION") +print("=" * 80) +print(f"Timestamp: {timestamp}") +print(f"Output directory: {PROBE_DIR.absolute()}") +print("=" * 80) + +# Run linting checks +print("\nšŸ“‹ STEP 1: LINTING CHECKS") +print("-" * 80) + +for check_name, command in lint_checks.items(): + print(f"\n{check_name.upper()}:") + result = subprocess.run(command, capture_output=True, text=True, timeout=60) + + output_file = PROBE_DIR / f"{check_name}_{timestamp}.txt" + output_file.write_text(result.stdout + "\n\n" + result.stderr) + + 
    passed = result.returncode == 0
    results["lint_checks"][check_name] = {
        "passed": passed,
        "output_file": str(output_file),
        "return_code": result.returncode,
    }

    if passed:
        print("  āœ… PASSED")
    else:
        print(f"  āŒ FAILED (exit code {result.returncode})")
    print(f"  šŸ“ Output saved to: {output_file.name}")

# Run test suites
print("\nšŸ“‹ STEP 2: TEST SUITES")
print("-" * 80)

total_passed = 0
total_failed = 0

for suite_name, test_path in test_suites.items():
    print(f"\n{suite_name.upper()} TESTS:")

    result = subprocess.run(
        ["python", "-m", "pytest", test_path, "-v", "--tb=short"],
        capture_output=True,
        text=True,
        timeout=300,  # Increased timeout for readme tests
    )

    # Save to proper subdirectory
    if suite_name == "root_readme":
        output_file = PROBE_DIR / "root" / f"readme_{timestamp}.txt"
    else:
        output_file = PROBE_DIR / suite_name / f"all_{timestamp}.txt"

    output_file.write_text(result.stdout + "\n\n" + result.stderr)

    # Parse results
    output = result.stdout + result.stderr

    # Extract pass/fail counts
    import re

    match = re.search(r"(\d+) passed", output)
    passed = int(match.group(1)) if match else 0

    match = re.search(r"(\d+) failed", output)
    failed = int(match.group(1)) if match else 0

    match = re.search(r"(\d+) skipped", output)
    skipped = int(match.group(1)) if match else 0

    total_passed += passed
    total_failed += failed

    results["test_suites"][suite_name] = {
        "passed": passed,
        "failed": failed,
        "skipped": skipped,
        "output_file": str(output_file),
        "return_code": result.returncode,
    }

    status = "āœ… PASSED" if failed == 0 else f"āŒ {failed} FAILED"
    print(f"  {status} - {passed} passed, {failed} failed, {skipped} skipped")
    print(f"  šŸ“ Output saved to: {output_file.relative_to(Path.cwd())}")

    # Also run individual test files for detailed inspection
    if suite_name in ["unit", "e2e", "integration"]:
        test_files = Path(test_path).glob("test_*.py")
        for test_file in test_files:
            individual_result = subprocess.run(
                ["python", "-m", "pytest", str(test_file), "-v", "--tb=short"],
                capture_output=True,
                text=True,
                timeout=60,
            )

            # Save individual test outputs
            individual_output = PROBE_DIR / suite_name / f"{test_file.stem}_{timestamp}.txt"
            individual_output.write_text(
                individual_result.stdout + "\n\n" + individual_result.stderr
            )

# Save summary
summary_file = PROBE_DIR / f"summary_{timestamp}.json"
results["summary"] = {
    "total_tests_passed": total_passed,
    "total_tests_failed": total_failed,
    "all_linting_passed": all(v["passed"] for v in results["lint_checks"].values()),
    "all_tests_passed": total_failed == 0,
    "overall_status": (
        "PASS"
        if (total_failed == 0 and all(v["passed"] for v in results["lint_checks"].values()))
        else "FAIL"
    ),
}

summary_file.write_text(json.dumps(results, indent=2))

# Final summary
print("\n" + "=" * 80)
print("FINAL VALIDATION SUMMARY")
print("=" * 80)

print("\nšŸ“Š TEST RESULTS:")
for suite, data in results["test_suites"].items():
    print(f"  {suite:15} {data['passed']:4} passed, {data['failed']:4} failed")

print(f"\n  TOTAL: {total_passed:4} passed, {total_failed:4} failed")

print("\nšŸ” LINTING:")
for check, data in results["lint_checks"].items():
    status = "āœ… PASS" if data["passed"] else "āŒ FAIL"
    print(f"  {check:15} {status}")

print(f"\nšŸ“ All outputs saved to: {PROBE_DIR.absolute()}")
print(f"šŸ“„ Summary: {summary_file.name}")

print("\n" + "=" * 80)
if results["summary"]["overall_status"] == "PASS":
    print("šŸŽ‰ ALL VALIDATIONS PASSED - SDK IS 100% WORKING")
else:
    print("āš ļø  SOME VALIDATIONS FAILED - CHECK PROBE OUTPUTS")
print("=" * 80)

# Exit with appropriate code
exit(0 if results["summary"]["overall_status"] == "PASS" else 1)
diff --git a/tests/test_cli.sh b/tests/test_cli.sh
new file mode 100755
index 0000000..03d6df8
--- /dev/null
+++ b/tests/test_cli.sh
@@ -0,0 +1,175 @@
#!/bin/bash
# Comprehensive CLI Testing Script
# Tests all brightdata CLI commands to validate end-user experience

set -e  # Exit on error

echo "================================================================================"
echo "COMPREHENSIVE CLI VALIDATION - Testing Real User Experience"
echo "================================================================================"
echo "Timestamp: $(date '+%Y%m%d_%H%M%S')"
echo "================================================================================"

# Create probe directory structure for CLI tests
PROBE_DIR="probe/cli"
mkdir -p "$PROBE_DIR"/{scrape,search,help,errors}

TIMESTAMP=$(date '+%Y%m%d_%H%M%S')
SUMMARY_FILE="$PROBE_DIR/cli_summary_$TIMESTAMP.txt"

# Track results
TOTAL_TESTS=0
PASSED_TESTS=0
FAILED_TESTS=0

# Helper function to run CLI test
run_cli_test() {
    local test_name=$1
    local command=$2
    local category=$3
    local output_file="$PROBE_DIR/$category/${test_name}_${TIMESTAMP}.txt"

    TOTAL_TESTS=$((TOTAL_TESTS + 1))

    echo ""
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    echo "TEST: $test_name"
    echo "COMMAND: $command"
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

    # Run command and save output
    if eval "$command" > "$output_file" 2>&1; then
        echo "  āœ… PASSED"
        echo "  šŸ“ Output: $output_file"
        PASSED_TESTS=$((PASSED_TESTS + 1))
        return 0
    else
        EXIT_CODE=$?
+ echo " āŒ FAILED (exit code: $EXIT_CODE)" + echo " šŸ“ Error output: $output_file" + FAILED_TESTS=$((FAILED_TESTS + 1)) + return 1 + fi +} + +# ============================================================================= +# STEP 1: HELP COMMANDS +# ============================================================================= + +echo "" +echo "šŸ“‹ STEP 1: HELP & INFO COMMANDS" +echo "================================================================================" + +run_cli_test "help_main" "brightdata --help" "help" +run_cli_test "help_scrape" "brightdata scrape --help" "help" +run_cli_test "help_search" "brightdata search --help" "help" +run_cli_test "help_scrape_amazon" "brightdata scrape amazon --help" "help" +run_cli_test "help_search_amazon" "brightdata search amazon --help" "help" +run_cli_test "help_search_linkedin" "brightdata search linkedin --help" "help" + +# ============================================================================= +# STEP 2: SCRAPE COMMANDS (if we have test token - these will fail without real API) +# ============================================================================= + +echo "" +echo "šŸ“‹ STEP 2: SCRAPE COMMANDS (syntax validation)" +echo "================================================================================" +echo "Note: These test CLI syntax, not actual API calls (would need valid token)" + +# Test CLI syntax validation (will fail on auth but validates parsing) +run_cli_test "scrape_amazon_products_help" \ + "brightdata scrape amazon products --help" \ + "scrape" || true + +run_cli_test "scrape_linkedin_profiles_help" \ + "brightdata scrape linkedin profiles --help" \ + "scrape" || true + +run_cli_test "scrape_facebook_posts_help" \ + "brightdata scrape facebook --help" \ + "scrape" || true + +run_cli_test "scrape_instagram_profiles_help" \ + "brightdata scrape instagram --help" \ + "scrape" || true + +# ============================================================================= +# STEP 3: SEARCH COMMANDS (syntax validation) +# ============================================================================= + +echo "" +echo "šŸ“‹ STEP 3: SEARCH COMMANDS (syntax validation)" +echo "================================================================================" + +run_cli_test "search_google_help" \ + "brightdata search google --help" \ + "search" || true + +run_cli_test "search_linkedin_jobs_help" \ + "brightdata search linkedin jobs --help" \ + "search" || true + +# ============================================================================= +# STEP 4: FORMAT OPTIONS +# ============================================================================= + +echo "" +echo "šŸ“‹ STEP 4: OUTPUT FORMAT OPTIONS" +echo "================================================================================" + +# Test that --output-format is recognized +run_cli_test "format_json_help" \ + "brightdata scrape --help | grep 'output-format'" \ + "help" || true + +run_cli_test "format_generic_help" \ + "brightdata scrape generic --help" \ + "help" || true + +# ============================================================================= +# FINAL SUMMARY +# ============================================================================= + +echo "" +echo "================================================================================" +echo "CLI VALIDATION SUMMARY" +echo "================================================================================" + +{ + echo "Timestamp: $(date)" + echo "" + echo "TEST RESULTS:" + echo " Total: $TOTAL_TESTS" + echo 
" Passed: $PASSED_TESTS" + echo " Failed: $FAILED_TESTS" + echo "" + + if [ $FAILED_TESTS -eq 0 ]; then + echo "āœ… ALL CLI TESTS PASSED" + echo "" + echo "CLI is fully functional and ready for users!" + else + echo "āš ļø $FAILED_TESTS test(s) failed" + echo "" + echo "Check probe/cli/ directory for details" + fi + + echo "" + echo "šŸ“ All outputs saved to: probe/cli/" + echo "" + echo "Directory structure:" + find "$PROBE_DIR" -type f | sort + +} | tee "$SUMMARY_FILE" + +echo "" +echo "================================================================================" +if [ $FAILED_TESTS -eq 0 ]; then + echo "šŸŽ‰ CLI VALIDATION COMPLETE - ALL SYSTEMS GO" + exit 0 +else: + echo "āš ļø SOME CLI TESTS FAILED - CHECK OUTPUTS" + exit 1 +fi +echo "================================================================================" +