@classmethod
async def _async_screenshot(
    cls,
    url: str,
    full_page: bool = True,
    wait_until: Literal[
        "load", "domcontentloaded", "networkidle", "commit"
    ] = "networkidle",
    download_path: Optional[str] = None,
) -> "FileStore":
    """Render ``url`` in headless Chromium and return the PNG as a FileStore.

    Args:
        url: Address of the page to capture.
        full_page: Forwarded to Playwright's ``page.screenshot``.
        wait_until: Navigation event forwarded to ``page.goto``.
        download_path: Where to write the PNG. When omitted, a unique file
            name is generated in the current working directory.

    Returns:
        ``cls(download_path, mime_type="image/png")``.

    Raises:
        ImportError: If Playwright is not installed.
    """
    import uuid

    try:
        from playwright.async_api import async_playwright
    except ImportError as exc:
        raise ImportError(
            "Screenshot functionality requires additional dependencies.\n"
            "Install them with: pip install 'edsl[screenshot]'"
        ) from exc

    if download_path is None:
        # A timestamp alone has one-second resolution, so concurrent captures
        # (see batch_screenshots) would all target the same file. The random
        # suffix keeps every default path unique.
        download_path = os.path.join(
            os.getcwd(),
            f"screenshot_{int(time.time())}_{uuid.uuid4().hex[:8]}.png",
        )

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page()
            await page.goto(url, wait_until=wait_until)
            await page.screenshot(path=download_path, full_page=full_page)
        finally:
            # Close the browser even when navigation or capture fails.
            await browser.close()

    return cls(download_path, mime_type="image/png")


@staticmethod
def _run_coroutine_sync(coro):
    """Run ``coro`` to completion from synchronous code and return its result.

    ``asyncio.run`` always creates and closes a private event loop, so no
    shared loop is closed behind the caller's back and repeated calls keep
    working. If a loop is already running in this thread (IPython/Jupyter),
    the coroutine is executed on a private loop in a worker thread, because
    a running loop cannot be re-entered with ``run_until_complete``.
    """
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: the simple path.
        return asyncio.run(coro)

    with ThreadPoolExecutor(max_workers=1) as executor:
        return executor.submit(asyncio.run, coro).result()


@classmethod
def from_url_screenshot(cls, url: str, **kwargs) -> "FileStore":
    """Synchronous wrapper around ``_async_screenshot``.

    Args:
        url: Address of the page to capture.
        **kwargs: Forwarded to ``_async_screenshot`` (``full_page``,
            ``wait_until``, ``download_path``).

    Returns:
        The FileStore produced by ``_async_screenshot``.
    """
    return cls._run_coroutine_sync(cls._async_screenshot(url, **kwargs))


@classmethod
def batch_screenshots(cls, urls: List[str], **kwargs) -> "ScenarioList":
    """Take screenshots of multiple URLs concurrently.

    Args:
        urls: List of URLs to screenshot.
        **kwargs: Additional arguments passed to ``_async_screenshot``
            (``full_page``, ``wait_until``, etc.). Passing ``download_path``
            makes every URL write to that same file, so leave it unset when
            capturing more than one URL.

    Returns:
        ScenarioList with one ``{"url": ..., "screenshot": ...}`` Scenario
        per URL that succeeded. Failures are printed and skipped.
    """

    async def _capture_all():
        # Coroutines are created inside the running loop so none is left
        # un-awaited if the loop cannot be started.
        return await asyncio.gather(
            *(cls._async_screenshot(url, **kwargs) for url in urls),
            return_exceptions=True,
        )

    results = cls._run_coroutine_sync(_capture_all())

    successful_results = []
    for url, result in zip(urls, results):
        # BaseException also covers asyncio.CancelledError, which gather can
        # hand back and which must not be wrapped as a screenshot.
        if isinstance(result, BaseException):
            print(f"Failed to screenshot {url}: {result}")
        else:
            successful_results.append(
                Scenario({"url": url, "screenshot": result})
            )

    return ScenarioList(successful_results)
class PlaywrightInstallPlugin:
    """Helper that downloads the Chromium build used by Playwright.

    NOTE(review): pyproject.toml registers this class under the
    ``poetry.application.plugin`` entry-point group, but it defines no
    ``activate`` method -- confirm Poetry can actually load it.
    """

    def install_browsers(self):
        """Run ``python -m playwright install chromium`` and report the result.

        Best-effort: every failure is printed rather than raised, and the
        method always returns ``None``.
        """
        print("Installing Playwright browsers...")
        # Use the current interpreter so the browser is installed for the
        # same environment this code is running in.
        command = [sys.executable, "-m", "playwright", "install", "chromium"]
        try:
            completed = subprocess.run(
                command, check=True, capture_output=True, text=True
            )
            print("Successfully installed Playwright browsers")
            if completed.stdout:
                print(f"Output: {completed.stdout}")
        except subprocess.CalledProcessError as err:
            # Non-zero exit status: show whatever the installer printed.
            print(f"Failed to install Playwright browsers: {err}")
            for label, stream in (("Output", err.stdout), ("Error", err.stderr)):
                if stream:
                    print(f"{label}: {stream}")
        except Exception as err:
            print(f"Unexpected error during Playwright installation: {err}")
= "sha256:34d28a2c2d46403368610be4339898dc9c34eb9f7c578207b4715c49743a072a"}, +] + +[package.dependencies] +greenlet = "3.1.1" +pyee = "12.0.0" + [[package]] name = "pluggy" version = "1.5.0" @@ -4295,6 +4315,23 @@ dev = ["black", "chardet"] release = ["zest.releaser[recommended]"] tests = ["black", "chardet", "tox"] +[[package]] +name = "pyee" +version = "12.0.0" +description = "A rough port of Node.js's EventEmitter to Python with a few tricks of its own" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyee-12.0.0-py3-none-any.whl", hash = "sha256:7b14b74320600049ccc7d0e0b1becd3b4bd0a03c745758225e31a59f4095c990"}, + {file = "pyee-12.0.0.tar.gz", hash = "sha256:c480603f4aa2927d4766eb41fa82793fe60a82cbfdb8d688e0d08c55a534e145"}, +] + +[package.dependencies] +typing-extensions = "*" + +[package.extras] +dev = ["black", "build", "flake8", "flake8-black", "isort", "jupyter-console", "mkdocs", "mkdocs-include-markdown-plugin", "mkdocstrings[python]", "pytest", "pytest-asyncio", "pytest-trio", "sphinx", "toml", "tox", "trio", "trio", "trio-typing", "twine", "twisted", "validate-pyproject[all]"] + [[package]] name = "pygments" version = "2.18.0" @@ -6049,7 +6086,10 @@ enabler = ["pytest-enabler (>=2.2)"] test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] type = ["pytest-mypy"] +[extras] +screenshot = ["playwright"] + [metadata] lock-version = "2.0" python-versions = ">=3.9.1,<3.13" -content-hash = "5fdfb0396d755e3c10f912d5a09bd7ebd29ae45442556747fcca5e44729ffb5a" +content-hash = "c8e407896e87e162a72a48e8aba9420e7f8871dd6adb165b6608e4a90162a894" diff --git a/pyproject.toml b/pyproject.toml index fc8c74dea..b2c7f423b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ mistralai = "^1.0.2" urllib3 = ">=1.25.4,<1.27" google-generativeai = "^0.8.2" tabulate = "^0.9.0" +playwright = { version = "^1.40.0", optional = true } 
[tool.poetry.dependencies.black] extras = ["jupyter"] @@ -81,3 +82,15 @@ uvicorn = "^0.30.6" [tool.tomlsort.overrides."tool.poetry.dependencies"] table_keys = false + +[tool.poetry.group.screenshot] +optional = true + +[tool.poetry.group.screenshot.dependencies] +playwright = "^1.40.0" + +[tool.poetry.plugins."poetry.application.plugin"] +install-playwright-browsers = "edsl.utilities.setup_utils:PlaywrightInstallPlugin" + +[tool.poetry.extras] +screenshot = ["playwright"]