2025年6月1日星期日

使用 Playwright 对网页截图的方法

import asyncio
import os

import pytesseract
from PIL import Image
from playwright.async_api import async_playwright


# Proxy server URL handed to the browser launch in download(); replace with your own proxy.
proxy = 'http://abc.com.cn:80'

async def download():
    """Screenshot the consent-manager element of the target page and OCR it.

    Launches a headed Chrome through the module-level ``proxy``, opens
    https://ddd.com/, waits for the network to go idle, saves a screenshot of
    ``div#transcend-consent-manager`` to ``shadow_host.png`` and runs
    pytesseract (``chi_sim+eng``) over that image.

    Returns:
        The dict produced by ``pytesseract.image_to_data`` for the screenshot.

    Raises:
        RuntimeError: If the consent-manager element is not found on the page.
    """
    screenshot_path = 'shadow_host.png'

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=False,
            channel="chrome",
            proxy={"server": proxy},  # route browser traffic through the proxy
        )
        try:
            context = await browser.new_context()
            page = await context.new_page()
            await page.goto("https://ddd.com/", timeout=180000)
            await page.wait_for_load_state("networkidle")

            # For a whole-page capture instead, use:
            #   await page.screenshot(path='fullpage.png', full_page=True)
            shadow_host = await page.query_selector('div#transcend-consent-manager')
            if shadow_host is None:
                # query_selector yields None when nothing matches; fail with a
                # clear message instead of an AttributeError on the next line.
                raise RuntimeError(
                    "element 'div#transcend-consent-manager' not found on page"
                )
            await shadow_host.screenshot(path=screenshot_path)
        finally:
            # Close the browser even when navigation or the screenshot fails.
            await browser.close()

    # The context manager closes the image file once OCR has finished.
    with Image.open(screenshot_path) as image:
        return pytesseract.image_to_data(
            image,
            lang='chi_sim+eng',
            output_type=pytesseract.Output.DICT,
        )

没有评论:

发表评论

安装 PaddleOCR的方法

  安装飞桨版本:https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/windows-pip.html 命令:  python -m pip...