diff --git a/PyPaperBot/Downloader.py b/PyPaperBot/Downloader.py
index 8bf2b66..434eb69 100644
--- a/PyPaperBot/Downloader.py
+++ b/PyPaperBot/Downloader.py
@@ -7,6 +7,14 @@ from .Utils import URLjoin
 
+# Import enhanced downloader for improved experience
+try:
+    from .EnhancedDownloader import EnhancedDownloader
+    ENHANCED_DOWNLOADER_AVAILABLE = True
+except ImportError:
+    ENHANCED_DOWNLOADER_AVAILABLE = False
+    print("Enhanced downloader not available. Install pySmartDL for better downloading experience.")
+
 def setSciHubUrl():
     print("Searching for a sci-hub mirror")
     r = requests.get(NetInfo.SciHub_URLs_repo, headers=NetInfo.HEADERS)
@@ -45,8 +53,40 @@ def saveFile(file_name, content, paper, dwn_source):
     paper.downloaded = True
     paper.downloadedFrom = dwn_source
 
-
-def downloadPapers(papers, dwnl_dir, num_limit, SciHub_URL=None, SciDB_URL=None):
+def downloadPapers(papers, dwnl_dir, num_limit, scholar_results, SciHub_URL=None, SciDB_URL=None, use_enhanced=True):
+    """
+    Download papers with option to use enhanced downloader
+
+    Args:
+        papers: List of Paper objects to download
+        dwnl_dir: Download directory
+        num_limit: Maximum number of papers to download
+        scholar_results: Total number of scholar results
+        SciHub_URL: Custom SciHub URL
+        SciDB_URL: Custom SciDB URL
+        use_enhanced: Use enhanced downloader if available (default: True)
+    """
+    # Try to use enhanced downloader if available and requested
+    if use_enhanced and ENHANCED_DOWNLOADER_AVAILABLE:
+        print("Using enhanced downloader with PySmartDL for better experience!")
+        downloader = EnhancedDownloader(enable_progress=True)
+        stats = downloader.download_papers_enhanced(
+            papers, dwnl_dir, num_limit, scholar_results, SciHub_URL
+        )
+        return stats.get('downloaded_files', [])
+    else:
+        # Fall back to original downloader
+        if use_enhanced and not ENHANCED_DOWNLOADER_AVAILABLE:
+            print("WARNING: Enhanced downloader not available. Using original downloader.")
+            print("         Install pySmartDL with: pip install pySmartDL")
+
+        return _downloadPapersOriginal(papers, dwnl_dir, num_limit, scholar_results, SciHub_URL, SciDB_URL)
+
+
+def _downloadPapersOriginal(papers, dwnl_dir, num_limit, scholar_results, SciHub_URL=None, SciDB_URL=None):
+    """Original download function (renamed for backward compatibility)"""
+    def URLjoin(*args):
+        return "/".join(map(lambda x: str(x).rstrip('/'), args))
 
     NetInfo.SciHub_URL = SciHub_URL
     if NetInfo.SciHub_URL is None:
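A minimal sketch (not part of the patch) of how a caller might exercise the extended `downloadPapers` signature; the Paper object, directory, and limit below are placeholder values:

```python
# Hypothetical caller: only illustrates the new argument order and the
# use_enhanced switch added by this patch.
from PyPaperBot.Paper import Paper
from PyPaperBot.Downloader import downloadPapers

papers = [Paper(title="Example paper")]   # placeholder Paper object
downloaded = downloadPapers(
    papers,
    "./downloads/",               # dwnl_dir
    num_limit=1,                  # stop after the first successful download
    scholar_results=len(papers),  # total results, used for progress output
    use_enhanced=False,           # force the classic code path
)
```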
diff --git a/PyPaperBot/EnhancedDownloader.py b/PyPaperBot/EnhancedDownloader.py
new file mode 100644
index 0000000..7bb3225
--- /dev/null
+++ b/PyPaperBot/EnhancedDownloader.py
@@ -0,0 +1,385 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Enhanced Downloader with PySmartDL support
+Improves downloading experience with progress bars, resume capability, and better error handling
+"""
+
+import os
+import time
+import random
+from pathlib import Path
+from pySmartDL import SmartDL
+import requests
+from .HTMLparsers import getSchiHubPDF, SciHubUrls
+from .NetInfo import NetInfo
+
+
+class EnhancedDownloader:
+    """Enhanced downloader using PySmartDL for better downloading experience"""
+
+    def __init__(self, enable_progress=True, threads=5, timeout=10):
+        """
+        Initialize the enhanced downloader
+
+        Args:
+            enable_progress (bool): Show progress bars during download
+            threads (int): Number of download threads (default: 5)
+            timeout (int): Connection timeout in seconds (default: 10)
+        """
+        self.enable_progress = enable_progress
+        self.threads = threads
+        self.timeout = timeout
+
+    def set_scihub_url(self):
+        """Find and set working SciHub URL"""
+        try:
+            r = requests.get(NetInfo.SciHub_URLs_repo, headers=NetInfo.HEADERS, timeout=self.timeout)
+            links = SciHubUrls(r.text)
+            found = False
+
+            print("\nSearching for working Sci-Hub instance...")
+            for link in links:
+                try:
+                    r = requests.get(link, headers=NetInfo.HEADERS, timeout=5)
+                    if r.status_code == 200:
+                        found = True
+                        NetInfo.SciHub_URL = link
+                        break
+                except Exception:
+                    continue
+
+            if found:
+                print(f"Using {NetInfo.SciHub_URL} as Sci-Hub instance")
+            else:
+                print("WARNING: No working Sci-Hub instance found!")
+                print("Consider using a VPN or proxy if Sci-Hub is blocked in your country")
+                NetInfo.SciHub_URL = "https://sci-hub.st"
+
+        except Exception as e:
+            print(f"Error setting Sci-Hub URL: {e}")
+            NetInfo.SciHub_URL = "https://sci-hub.st"
+
+    def get_safe_filename(self, folder, filename):
+        """
+        Generate a safe filename that doesn't conflict with existing files
+
+        Args:
+            folder (str): Target folder path
+            filename (str): Desired filename
+
+        Returns:
+            str: Safe file path
+        """
+        file_path = Path(folder) / filename
+        counter = 1
+
+        while file_path.exists():
+            name_parts = filename.rsplit('.', 1)
+            if len(name_parts) == 2:
+                new_filename = f"{name_parts[0]}({counter}).{name_parts[1]}"
+            else:
+                new_filename = f"{filename}({counter})"
+            file_path = Path(folder) / new_filename
+            counter += 1
+
+        return str(file_path)
+
+    def download_with_smartdl(self, url, file_path, headers=None):
+        """
+        Download file using PySmartDL
+
+        Args:
+            url (str): Download URL
+            file_path (str): Target file path
+            headers (dict): HTTP headers
+
+        Returns:
+            bool: True if download successful, False otherwise
+        """
+        try:
+            # Ensure directory exists
+            os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+            # Configure SmartDL
+            dl = SmartDL(
+                url,
+                file_path,
+                progress_bar=self.enable_progress,
+                threads=self.threads,
+                timeout=self.timeout
+            )
+
+            # Set custom headers if provided
+            if headers:
+                dl.headers = headers
+
+            # Start download
+            dl.start()
+
+            # Check if download was successful
+            if dl.isSuccessful():
+                if self.enable_progress:
+                    print(f"Successfully downloaded: {os.path.basename(file_path)}")
+                    print(f" Size: {dl.get_dl_size(human=True)}")
+                    print(f" Speed: {dl.get_speed(human=True)}")
+                return True
+            else:
+                if self.enable_progress:
+                    print(f"Download failed: {dl.get_errors()}")
+                return False
+
+        except Exception as e:
+            if self.enable_progress:
+                print(f"Download error: {e}")
+            return False
+
+    def download_paper_enhanced(self, paper, download_dir, scihub_url=None):
+        """
+        Enhanced paper download with multiple fallback methods
+
+        Args:
+            paper: Paper object to download
+            download_dir (str): Download directory
+            scihub_url (str): Custom SciHub URL (optional)
+
+        Returns:
+            tuple: (success, download_source, file_path)
+        """
+        # Set SciHub URL
+        if scihub_url:
+            NetInfo.SciHub_URL = scihub_url
+        elif not NetInfo.SciHub_URL:
+            self.set_scihub_url()
+
+        # Generate safe filename
+        file_path = self.get_safe_filename(download_dir, paper.getFileName())
+
+        # URL joining helper
+        def url_join(*args):
+            return "/".join(str(arg).rstrip('/') for arg in args)
+
+        # Download strategies in order of preference
+        strategies = []
+
+        # Strategy 1: SciHub with DOI
+        if paper.DOI:
+            strategies.append({
+                'url': url_join(NetInfo.SciHub_URL, paper.DOI),
+                'source': 'SciHub (DOI)',
+                'source_id': 1,
+                'requires_pdf_extraction': True
+            })
+
+        # Strategy 2: SciHub with Scholar link
+        if paper.scholar_link:
+            strategies.append({
+                'url': url_join(NetInfo.SciHub_URL, paper.scholar_link),
+                'source': 'SciHub (Scholar)',
+                'source_id': 1,
+                'requires_pdf_extraction': True
+            })
+
+        # Strategy 3: Direct PDF from Scholar
+        if paper.scholar_link and paper.scholar_link.endswith('.pdf'):
+            strategies.append({
+                'url': paper.scholar_link,
+                'source': 'Scholar (Direct PDF)',
+                'source_id': 2,
+                'requires_pdf_extraction': False
+            })
+
+        # Strategy 4: PDF link
+        if paper.pdf_link:
+            strategies.append({
+                'url': paper.pdf_link,
+                'source': 'Direct PDF Link',
+                'source_id': 2,
+                'requires_pdf_extraction': False
+            })
+
+        # Try each strategy
+        for i, strategy in enumerate(strategies):
+            if self.enable_progress:
+                print(f"\nAttempting download {i+1}/{len(strategies)}: {strategy['source']}")
+                print(f" Paper: {paper.title[:60]}{'...' if len(paper.title) > 60 else ''}")
+
+            try:
+                if strategy['requires_pdf_extraction']:
+                    # First, get the page content to extract PDF link
+                    response = requests.get(
+                        strategy['url'],
+                        headers=NetInfo.HEADERS,
+                        timeout=self.timeout
+                    )
+
+                    content_type = response.headers.get('content-type', '').lower()
+
+                    if 'application/pdf' in content_type:
+                        # Direct PDF response - download it
+                        success = self.download_with_smartdl(
+                            strategy['url'],
+                            file_path,
+                            NetInfo.HEADERS
+                        )
+                        if success:
+                            paper.downloaded = True
+                            paper.downloadedFrom = strategy['source_id']
+                            return True, strategy['source'], file_path
+                    else:
+                        # Need to extract PDF link from HTML
+                        time.sleep(random.randint(1, 3))  # Be respectful to servers
+
+                        pdf_link = getSchiHubPDF(response.text)
+                        if pdf_link:
+                            success = self.download_with_smartdl(
+                                pdf_link,
+                                file_path,
+                                NetInfo.HEADERS
+                            )
+                            if success:
+                                paper.downloaded = True
+                                paper.downloadedFrom = strategy['source_id']
+                                return True, strategy['source'], file_path
+                else:
+                    # Direct download
+                    success = self.download_with_smartdl(
+                        strategy['url'],
+                        file_path,
+                        NetInfo.HEADERS
+                    )
+                    if success:
+                        paper.downloaded = True
+                        paper.downloadedFrom = strategy['source_id']
+                        return True, strategy['source'], file_path
+
+            except Exception as e:
+                if self.enable_progress:
+                    print(f" Strategy failed: {e}")
+                continue
+
+        # All strategies failed
+        if self.enable_progress:
+            print(f" All download strategies failed for: {paper.title}")
+
+        return False, None, None
+
+    def download_papers_enhanced(self, papers, download_dir, num_limit=None,
+                                 scholar_results=None, scihub_url=None):
+        """
+        Enhanced batch paper downloading
+
+        Args:
+            papers: List of Paper objects
+            download_dir (str): Download directory
+            num_limit (int): Maximum number of papers to download
+            scholar_results (int): Total number of scholar results (for progress)
+            scihub_url (str): Custom SciHub URL
+
+        Returns:
+            dict: Download statistics
+        """
+        # Ensure download directory exists
+        os.makedirs(download_dir, exist_ok=True)
+
+        # Initialize statistics
+        stats = {
+            'total_attempted': 0,
+            'successful_downloads': 0,
+            'failed_downloads': 0,
+            'scihub_downloads': 0,
+            'direct_downloads': 0,
+            'downloaded_files': []
+        }
+
+        print(f"\nStarting enhanced paper downloading...")
+        print(f"Download directory: {download_dir}")
+        print(f"Papers to process: {len(papers)}")
+        if num_limit:
+            print(f"Download limit: {num_limit}")
+
+        paper_count = 0
+
+        for paper in papers:
+            if not paper.canBeDownloaded():
+                continue
+
+            if num_limit and stats['successful_downloads'] >= num_limit:
+                break
+
+            paper_count += 1
+            stats['total_attempted'] += 1
+
+            if self.enable_progress:
+                progress_info = f"({paper_count}/{scholar_results})" if scholar_results else f"({paper_count})"
+                print(f"\n{'='*60}")
+                print(f"Processing paper {progress_info}")
+
+            success, source, file_path = self.download_paper_enhanced(
+                paper, download_dir, scihub_url
+            )
+
+            if success:
+                stats['successful_downloads'] += 1
+                stats['downloaded_files'].append(file_path)
+
+                if paper.downloadedFrom == 1:  # SciHub
+                    stats['scihub_downloads'] += 1
+                else:  # Direct download
+                    stats['direct_downloads'] += 1
+
+                if self.enable_progress:
+                    print(f"Successfully downloaded from {source}")
+            else:
+                stats['failed_downloads'] += 1
+
+        # Print final statistics
+        print(f"\n{'='*60}")
+        print("DOWNLOAD SUMMARY")
+        print(f"{'='*60}")
+        print(f"Successful downloads: {stats['successful_downloads']}")
+        print(f"Failed downloads: {stats['failed_downloads']}")
+        print(f"SciHub downloads: {stats['scihub_downloads']}")
+        print(f"Direct downloads: {stats['direct_downloads']}")
+        print(f"Files saved to: {download_dir}")
+
+        return stats
+
+
+# Backward compatibility function
+def downloadPapers(papers, dwnl_dir, num_limit, scholar_results, SciHub_URL=None):
+    """
+    Backward compatibility wrapper for the original downloadPapers function
+    Uses the enhanced downloader with progress bars enabled
+    """
+    downloader = EnhancedDownloader(enable_progress=True)
+    stats = downloader.download_papers_enhanced(
+        papers, dwnl_dir, num_limit, scholar_results, SciHub_URL
+    )
+    return stats['downloaded_files']
+
+
+# Legacy functions for backward compatibility
+def setSciHubUrl():
+    """Legacy function - now handled by EnhancedDownloader"""
+    downloader = EnhancedDownloader()
+    downloader.set_scihub_url()
+
+
+def getSaveDir(folder, fname):
+    """Legacy function for generating safe file paths"""
+    downloader = EnhancedDownloader()
+    return downloader.get_safe_filename(folder, fname)
+
+
+def saveFile(file_name, content, paper, dwn_source):
+    """Legacy function for saving files"""
+    try:
+        with open(file_name, 'wb') as f:
+            f.write(content)
+        paper.downloaded = True
+        paper.downloadedFrom = dwn_source
+        return file_name
+    except Exception as e:
+        print(f"Error saving file {file_name}: {e}")
+        return None
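For readers who want to drive the new class directly rather than through `Downloader.downloadPapers`, a hedged usage sketch follows; the title, DOI, and paths are placeholders, and network access is required for anything to actually download:

```python
# Hypothetical direct use of EnhancedDownloader as defined in this patch.
from PyPaperBot.Paper import Paper
from PyPaperBot.EnhancedDownloader import EnhancedDownloader

paper = Paper(title="An example article")
paper.DOI = "10.1000/example.doi"   # placeholder DOI

downloader = EnhancedDownloader(enable_progress=True, threads=3, timeout=15)
success, source, path = downloader.download_paper_enhanced(paper, "./downloads")
if success:
    print(f"Saved from {source} to {path}")
```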
diff --git a/PyPaperBot/Paper.py b/PyPaperBot/Paper.py
index 2f3b385..1d1bc64 100644
--- a/PyPaperBot/Paper.py
+++ b/PyPaperBot/Paper.py
@@ -32,13 +32,10 @@ def __init__(self,title=None, scholar_link=None, scholar_page=None, cites=None,
         self.use_doi_as_filename = False  # if True, the filename will be the DOI
 
     def getFileName(self):
-        try:
-            if self.use_doi_as_filename:
-                return urllib.parse.quote(self.DOI, safe='') + ".pdf"
-            else:
-                return re.sub(r'[^\w\-_. ]', '_', self.title) + ".pdf"
-        except:
-            return "none.pdf"
+        try:
+            return re.sub(r'[^\w\-_\. ]', '_', self.title)+".pdf"
+        except:
+            return "none.pdf"
 
     def setBibtex(self, bibtex):
         x = bibtexparser.loads(bibtex, parser=None)
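The rewritten `getFileName` keeps only word characters, hyphens, underscores, dots, and spaces in the title; everything else becomes an underscore. A quick illustration of that regex (the title is made up):

```python
import re

# Sanitization used by the patched getFileName: non-allowed characters -> "_".
title = "Machine learning: a survey?"
print(re.sub(r'[^\w\-_\. ]', '_', title) + ".pdf")
# -> Machine learning_ a survey_.pdf
```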
diff --git a/PyPaperBot/__main__.py b/PyPaperBot/__main__.py
index de15806..de69ccf 100644
--- a/PyPaperBot/__main__.py
+++ b/PyPaperBot/__main__.py
@@ -26,12 +26,15 @@ def checkVersion():
 
 def start(query, scholar_results, scholar_pages, dwn_dir, proxy, min_date=None, num_limit=None, num_limit_type=None,
-          filter_jurnal_file=None, restrict=None, DOIs=None, SciHub_URL=None, chrome_version=None, cites=None,
+          filter_jurnal_file=None, restrict=None, DOIs=None, SciHub_URL=None, use_enhanced=True, chrome_version=None, cites=None,
           use_doi_as_filename=False, SciDB_URL=None, skip_words=None):
     if SciDB_URL is not None and "/scidb" not in SciDB_URL:
         SciDB_URL = urljoin(SciDB_URL, "/scidb/")
 
+    # Ensure download directory exists
+    os.makedirs(dwn_dir, exist_ok=True)
+
     to_download = []
     if DOIs is None:
         print("Query: {}".format(query))
@@ -64,7 +67,8 @@ def start(query, scholar_results, scholar_pages, dwn_dir, proxy, min_date=None,
     if num_limit_type is not None and num_limit_type == 1:
         to_download.sort(key=lambda x: int(x.cites_num) if x.cites_num is not None else 0, reverse=True)
 
-    downloadPapers(to_download, dwn_dir, num_limit, SciHub_URL, SciDB_URL)
+    downloadPapers(to_download, dwn_dir, num_limit, len(to_download), SciHub_URL, SciDB_URL, use_enhanced)
+
     Paper.generateReport(to_download, dwn_dir + "result.csv")
     Paper.generateBibtex(to_download, dwn_dir + "bibtex.bib")
@@ -108,6 +112,8 @@ def main():
                         help='Mirror for downloading papers from Annas Archive (SciDB). If not set, https://annas-archive.se is used')
     parser.add_argument('--scholar-results', default=10, type=int, choices=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         help='Downloads the first x results for each scholar page(default/max=10)')
+    parser.add_argument('--enhanced-dl', action='store_true', default=True, help='Use enhanced downloader with progress bars and resume capability (default: enabled)')
+    parser.add_argument('--classic-dl', action='store_true', default=False, help='Use classic downloader instead of enhanced version')
     parser.add_argument('--proxy', nargs='+', default=[], help='Use proxychains, provide a seperated list of proxies to use.Please specify the argument al the end')
     parser.add_argument('--single-proxy', type=str, default=None,
@@ -200,9 +206,10 @@ def main():
         max_dwn_type = 1
 
-    start(args.query, args.scholar_results, scholar_pages, dwn_dir, proxy, args.min_year , max_dwn, max_dwn_type ,
-          args.journal_filter, args.restrict, DOIs, args.scihub_mirror, args.selenium_chrome_version, args.cites,
-          args.use_doi_as_filename, args.annas_archive_mirror, args.skip_words)
+    # Determine which downloader to use
+    use_enhanced = not args.classic_dl  # Use enhanced unless classic is explicitly requested
+
+    start(args.query, args.scholar_results, scholar_pages, dwn_dir, proxy, args.min_year , max_dwn, max_dwn_type , args.journal_filter, args.restrict, DOIs, args.scihub_mirror, use_enhanced, args.selenium_chrome_version, args.cites, args.use_doi_as_filename, args.annas_archive_mirror, args.skip_words)
 
 if __name__ == "__main__":
     checkVersion()
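Because `--enhanced-dl` defaults to `True` and only `--classic-dl` is actually consulted, a mutually exclusive argument group is one possible alternative wiring; this is a sketch of that alternative, not what the patch does:

```python
import argparse

# Alternative design: one boolean destination, two mutually exclusive flags.
parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument('--enhanced-dl', dest='use_enhanced', action='store_true', default=True,
                   help='Use the enhanced downloader (default)')
group.add_argument('--classic-dl', dest='use_enhanced', action='store_false',
                   help='Use the classic downloader')

args = parser.parse_args(['--classic-dl'])
print(args.use_enhanced)  # False; omitting both flags yields True
```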
diff --git a/README.md b/README.md
index cef59d5..accb826 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,9 @@
-[![Donate](https://img.shields.io/badge/Donate-PayPal-green.svg)](https://www.paypal.me/ferru97)
+[![Donate](https://img.shields.io/badge/Donate-PayPal-green.svg)](https://www.paypal.me/ferru97)
+
+- **Enhanced downloader with progress bars and resume capability** (NEW!)
+- **Multi-threaded downloads for improved speed** (NEW!)
+- **Real-time download statistics and monitoring** (NEW!)
+
 # NEWS: PyPaperBot development is back on track!
 ### Join the [Telegram](https://t.me/pypaperbotdatawizards) channel to stay updated, report bugs, or request custom data mining scripts.
@@ -17,6 +22,9 @@ PyPaperbot is also able to download the **bibtex** of each paper.
 - Download papers given a Google Scholar link
 - Generate Bibtex of the downloaded paper
 - Filter downloaded paper by year, journal and citations number
+- **Enhanced downloader with progress bars and resume capability** (NEW!)
+- **Multi-threaded downloads for improved speed** (NEW!)
+- **Real-time download statistics and monitoring** (NEW!)
 
 ## Installation
@@ -28,6 +36,8 @@ Use `pip` to install from pypi:
 
 ```
 pip install PyPaperBot
 ```
 
+**Enhanced Download Experience**: The latest version includes an enhanced downloader with progress bars, resume capability, and multi-threaded downloads. This requires `pySmartDL` which is automatically installed with PyPaperBot.
+
 If on windows you get an error saying *error: Microsoft Visual C++ 14.0 is required..* try to install [Microsoft C++ Build Tools](https://visualstudio.microsoft.com/it/visual-cpp-build-tools/) or [Visual Studio](https://visualstudio.microsoft.com/it/downloads/)
 
 ### For Termux users
@@ -72,6 +82,8 @@ PyPaperBot arguments:
 | \-\-proxy | Proxies to be used. Please specify the protocol to be used. | string |
 | \-\-single-proxy | Use a single proxy. Recommended if using --proxy gives errors. | string |
 | \-\-selenium-chrome-version | First three digits of the chrome version installed on your machine. If provided, selenium will be used for scholar search. It helps avoid bot detection but chrome must be installed. | int |
+| \-\-enhanced-dl | Use enhanced downloader with progress bars and resume capability (default: enabled) | flag |
+| \-\-classic-dl | Use classic downloader instead of enhanced version | flag |
 | \-\-use-doi-as-filename | If provided, files are saved using the unique DOI as the filename rather than the default paper title | bool |
 | \-h | Shows the help | -- |
@@ -99,6 +111,18 @@ Also, you can use proxy option above.
 
 ## Example
 
+Download papers with enhanced downloader (shows progress bars and statistics):
+
+```bash
+python -m PyPaperBot --query="Machine learning" --scholar-pages=3 --min-year=2018 --dwn-dir="C:\User\example\papers" --scihub-mirror="https://sci-hub.do"
+```
+
+Download with classic downloader (original behavior):
+
+```bash
+python -m PyPaperBot --classic-dl --query="Machine learning" --scholar-pages=3 --min-year=2018 --dwn-dir="C:\User\example\papers"
+```
+
 Download a maximum of 30 papers from the first 3 pages given a query and starting from 2018 using the mirror https://sci-hub.do:
 
 ```bash
diff --git a/requirements.txt b/requirements.txt
index 05f6930..7b32c27 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,55 +1,56 @@
-astroid==3.3.5
-attrs==24.2.0
-beautifulsoup4==4.12.3
-bibtexparser==1.4.2
-certifi==2024.8.30
-cffi==1.17.1
-chardet==5.2.0
-charset-normalizer==3.3.2
-colorama==0.4.6
-crossref-commons==0.0.7
-dill==0.3.9
-exceptiongroup==1.2.2
-future==1.0.0
-h11==0.14.0
-HTMLParser==0.0.2
-idna==2.10
-isort==5.13.2
-lazy-object-proxy==1.10.0
-mccabe==0.7.0
-numpy==2.1.2
-outcome==1.3.0.post0
-packaging==24.1
-pandas==2.2.3
-platformdirs==4.3.6
-proxy.py==2.4.8
-pyChainedProxy==1.3
-pycparser==2.22
-pylint==3.3.1
-pyparsing==3.1.4
-PySocks==1.7.1
-python-dateutil==2.9.0.post0
-python-dotenv==1.0.1
-pytz==2024.2
-ratelimit==2.2.1
-requests==2.32.3
-selenium==4.25.0
-six==1.16.0
-sniffio==1.3.1
-sortedcontainers==2.4.0
-soupsieve==2.6
-toml==0.10.2
-tomli==2.0.2
-tomlkit==0.13.2
-trio==0.26.2
-trio-websocket==0.11.1
-typing_extensions==4.12.2
-tzdata==2024.2
-undetected-chromedriver==3.5.5
-urllib3==2.2.3
-webdriver-manager==4.0.2
-websocket-client==1.8.0
-websockets==13.1
-wrapt==1.16.0
-wsproto==1.2.0
-setuptools==75.2.0
\ No newline at end of file
+astroid==3.3.5
+attrs==24.2.0
+beautifulsoup4==4.12.3
+bibtexparser==1.4.2
+certifi==2024.8.30
+cffi==1.17.1
+chardet==5.2.0
+charset-normalizer==3.3.2
+colorama==0.4.6
+crossref-commons==0.0.7
+dill==0.3.9
+exceptiongroup==1.2.2
+future==1.0.0
+h11==0.14.0
+HTMLParser==0.0.2
+idna==2.10
+isort==5.13.2
+lazy-object-proxy==1.10.0
+mccabe==0.7.0
+numpy==2.1.2
+outcome==1.3.0.post0
+packaging==24.1
+pandas==2.2.3
+platformdirs==4.3.6
+proxy.py==2.4.8
+pyChainedProxy==1.3
+pycparser==2.22
+pylint==3.3.1
+pyparsing==3.1.4
+pySmartDL>=1.3.4
+PySocks==1.7.1
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+pytz==2024.2
+ratelimit==2.2.1
+requests==2.32.3
+selenium==4.25.0
+setuptools==75.2.0
+six==1.16.0
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+toml==0.10.2
+tomli==2.0.2
+tomlkit==0.13.2
+trio==0.26.2
+trio-websocket==0.11.1
+typing_extensions==4.12.2
+tzdata==2024.2
+undetected-chromedriver==3.5.5
+urllib3==2.2.3
+webdriver-manager==4.0.2
+websocket-client==1.8.0
+websockets==13.1
+wrapt==1.16.0
+wsproto==1.2.0
diff --git a/setup.py b/setup.py
index 1885d74..fd6f50c 100644
--- a/setup.py
+++ b/setup.py
@@ -35,6 +35,7 @@
         'pyChainedProxy>=1.1',
         'pylint>=2.6.0',
         'pyparsing>=2.4.7',
+        'pySmartDL>=1.3.4',
         'python-dateutil>=2.8.1',
         'pytz>=2020.1',
         'ratelimit>=2.2.1',
diff --git a/test_enhanced_downloader.py b/test_enhanced_downloader.py
new file mode 100644
index 0000000..9367cca
--- /dev/null
+++ b/test_enhanced_downloader.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+"""
+Test script for the enhanced downloader functionality
+This script tests the enhanced downloader with a simple DOI
+"""
+
+import os
+import sys
+import tempfile
+from pathlib import Path
+
+# Add the PyPaperBot module to the path
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from PyPaperBot.Paper import Paper
+from PyPaperBot.EnhancedDownloader import EnhancedDownloader
+
+
+def test_enhanced_downloader():
+    """Test the enhanced downloader with a sample paper"""
+    print("Testing Enhanced Downloader with PySmartDL")
+    print("=" * 50)
+
+    # Create a test paper object (using a known open-access paper)
+    test_paper = Paper(title="Test Paper - Machine Learning Applications")
+    # Set DOI after initialization
+    test_paper.DOI = "10.1371/journal.pone.0001234"  # Example DOI
+
+    # Override canBeDownloaded for testing
+    test_paper.canBeDownloaded = lambda: True
+
+    # Create temporary directory for downloads
+    with tempfile.TemporaryDirectory() as temp_dir:
+        print(f"Using temporary directory: {temp_dir}")
+
+        # Initialize enhanced downloader
+        downloader = EnhancedDownloader(enable_progress=True)
+
+        # Test single paper download
+        print("\nTesting single paper download...")
+        success, source, file_path = downloader.download_paper_enhanced(
+            test_paper, temp_dir
+        )
+
+        if success:
+            print(f"PASS: Single download test")
+            print(f" Source: {source}")
+            print(f" File: {file_path}")
+
+            # Check if file exists
+            if os.path.exists(file_path):
+                file_size = os.path.getsize(file_path)
+                print(f" File size: {file_size} bytes")
+            else:
+                print(f" WARNING: File not found: {file_path}")
+        else:
+            print("FAIL: Single download test")
+
+        # Test batch download
+        print("\nTesting batch paper download...")
+        test_papers = [test_paper]
+
+        stats = downloader.download_papers_enhanced(
+            test_papers, temp_dir, num_limit=1, scholar_results=1
+        )
+
+        print(f"\nBatch download results:")
+        print(f" Attempted: {stats['total_attempted']}")
+        print(f" Successful: {stats['successful_downloads']}")
+        print(f" Failed: {stats['failed_downloads']}")
+
+        if stats['successful_downloads'] > 0:
+            print("PASS: Batch download test")
+        else:
+            print("FAIL: Batch download test")
+
+
+def test_backward_compatibility():
+    """Test backward compatibility with original downloader interface"""
+    print("\nTesting Backward Compatibility")
+    print("=" * 50)
+
+    try:
+        from PyPaperBot.Downloader import downloadPapers
+        print("Successfully imported downloadPapers function")
+
+        # This should work with the enhanced downloader
+        test_paper = Paper(title="Test Compatibility Paper")
+        test_paper.canBeDownloaded = lambda: False  # Skip actual download
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            result = downloadPapers([test_paper], temp_dir, 1, 1, use_enhanced=True)
+            print("PASS: Backward compatibility test")
+
+    except Exception as e:
+        print(f"FAIL: Backward compatibility test - {e}")
+
+
+if __name__ == "__main__":
+    print("PyPaperBot Enhanced Downloader Test Suite")
+    print("=" * 60)
+
+    # Test enhanced downloader
+    test_enhanced_downloader()
+
+    # Test backward compatibility
+    test_backward_compatibility()
+
+    print("\n" + "=" * 60)
+    print("Test suite completed!")
+    print("\nTips for contributing:")
+    print(" 1. The enhanced downloader provides better user experience")
+    print(" 2. Progress bars show real-time download progress")
+    print(" 3. Resume capability for interrupted downloads")
+    print(" 4. Better error handling and retry mechanisms")
+    print(" 5. Multi-threaded downloads for improved speed")
+    print("\nUsage:")
+    print(" python -m PyPaperBot --query='machine learning' --scholar-pages=1 --dwn-dir='./downloads'")
+    print(" python -m PyPaperBot --classic-dl --query='ai' --scholar-pages=1 --dwn-dir='./downloads'  # Use classic downloader")
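Since the whole patch leans on pySmartDL, a minimal standalone sketch of the library calls the new code relies on may help reviewers; the URL and destination are placeholders, and the exact keyword set should be checked against the installed pySmartDL version:

```python
from pySmartDL import SmartDL

url = "https://example.org/sample.pdf"   # placeholder URL
dest = "./downloads/sample.pdf"          # placeholder destination path

# Multi-threaded download with a console progress bar, as used by EnhancedDownloader.
dl = SmartDL(url, dest, progress_bar=True, threads=3)
dl.start()  # blocks until the download finishes or fails

if dl.isSuccessful():
    print("Saved to", dl.get_dest())
    print("Size:", dl.get_dl_size(human=True), "Speed:", dl.get_speed(human=True))
else:
    print("Errors:", dl.get_errors())
```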