如何抓取需要登录账号的数据?

方法一: 先登录

方法二:携带登录好的数据(cookies, local_storage, session)

本文讲述第二种方法。

  1. 利用Chrome extension “Storage & Cookie Exporter” 下载已经登录网站的cookies, storage, session为json格式文件。
  2. 启动浏览器并注入cookie, storage, session
  3. 先尝试只在context注入cookie,启动浏览器看能否正常进入账号,如果不行,就注入storage, session。
    async def _load_cookies_to_context(self, context):
        """Inject the exported cookies into a browser context.

        Reads the ``cookieStorageData`` list from ``self._cookie_data`` and
        adapts every entry before handing the list to ``context.add_cookies``:

        * ``expires`` is ``-1`` for session cookies (or when the export has no
          ``expirationDate``), otherwise the exported ``expirationDate``.
        * ``sameSite`` becomes ``"Lax"`` or ``"Strict"`` when the exported value
          matches one of those case-insensitively, and ``"None"`` otherwise
          (including when the field is missing).

        All other exported fields are passed through unchanged.

        Args:
            context: Object exposing awaitable ``add_cookies(list)`` and
                ``cookies()`` methods (the Playwright browser context created
                by the caller).

        Returns:
            None. Returns early, without touching ``context``, when
            ``self._cookie_data`` is empty.
        """
        if not self._cookie_data:
            logging.debug("No cookie data available")
            return

        cookies_data = self._cookie_data.get("cookieStorageData", [])
        if cookies_data:
            playwright_cookies = []
            for cookie in cookies_data:
                # Session cookies carry no expiry in the export; -1 marks them
                # as session-scoped for add_cookies.
                is_session = cookie.get("session") or "expirationDate" not in cookie
                expires = -1 if is_session else cookie["expirationDate"]

                # Only "Lax"/"Strict" are kept; every other exported value
                # (or a missing one) is mapped to "None".
                same_site = str(cookie.get("sameSite") or "").lower()
                same_site = same_site.capitalize() if same_site in ("lax", "strict") else "None"

                playwright_cookies.append(
                    {**cookie, "expires": expires, "sameSite": same_site}
                )
            await context.add_cookies(playwright_cookies)

        # Verify loaded data. The value must go through a %s placeholder:
        # passing it as a bare extra argument makes record formatting fail.
        cookies = await context.cookies()
        logging.debug("Current Cookies: %s", cookies)
        logging.debug("Cookies loaded successfully")


这样可以正常启动浏览器并通过 new_context() 创建新的context,同时可以注入时区等参数:

   async def scrape_store_data(self, store_data: StoreData, idx: int)->Optional[OutputStoreData] :
        """Open a cookie-authenticated browser page wrapped by AgentQL.

        Launches a headed Chromium instance, creates a fresh context with a
        fixed locale, time zone and user agent, injects the saved cookies via
        ``self._load_cookies_to_context`` and opens a page that is wrapped with
        ``agentql.wrap_async``.

        NOTE(review): the body is cut off after ``try:`` in this excerpt, so
        how ``store_data`` and ``idx`` are used and what is returned cannot be
        confirmed from here.
        """
        async with async_playwright() as p:
            # Launch a new Chromium instance (this is a launch, not a CDP
            # attach to an already-running browser).
            browser = await p.chromium.launch(
                headless=False,  # False shows the browser window
                args=["--disable-blink-features=AutomationControlled"],
            )
            # Create a new context with explicit locale, time zone and user
            # agent; cookies are injected into it before any page is opened.
            context = await browser.new_context(
                locale='en-US',
                timezone_id='America/Los_Angeles',  # run the context in the America/Los_Angeles time zone
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            )
            await self._load_cookies_to_context(context)
            pg = await context.new_page()
            # Storage injection is disabled here; only cookies are loaded.
            # await self._load_storage_to_page(pg)
            # Wrap the page with AgentQL using the async wrapper.
            async with await agentql.wrap_async(pg) as agentql_page:
                try:

如果需要注入local storage及session,则需要在page注入:



    async def load_storage_to_page(page, root_path):
        """Populate localStorage and sessionStorage on ``page`` from the export.

        Navigates ``page`` to ``target_url`` (web storage is scoped to an
        origin, so the page must be on the target site first), then writes
        every entry of ``storage_data["localStorageData"]`` and
        ``storage_data["sessionStorageData"]`` into the matching storage area.

        The page passed in by the caller is the one that is populated. This
        matters for sessionStorage, which is scoped to a single tab and so
        would not be visible if it were written through a different page.

        NOTE(review): ``target_url`` and ``storage_data`` are not defined in
        this snippet; they must be supplied by the enclosing scope.

        Args:
            page: Playwright page to navigate and populate.
            root_path: Not used by this function; kept so existing callers
                keep working.
        """
        import json  # local import: only needed to serialise non-string values

        # Navigate to the target URL (required for local/session storage)
        await page.goto(target_url)

        storage_areas = (
            ("localStorageData", "localStorage", "Local storage loaded successfully"),
            ("sessionStorageData", "sessionStorage", "Session storage loaded successfully"),
        )
        for export_key, js_store, done_message in storage_areas:
            entries = storage_data.get(export_key, {})
            if not entries:
                continue
            for key, value in entries.items():
                # Web storage only holds strings; anything else is stored as
                # JSON, which is what page scripts expect to parse back.
                value_str = value if isinstance(value, str) else json.dumps(value, ensure_ascii=False)
                # Pass key/value as evaluate() arguments instead of splicing
                # them into the script text: quotes, backslashes or newlines
                # in the data would otherwise break (or inject into) the JS.
                await page.evaluate(
                    f"([k, v]) => {js_store}.setItem(k, v)",
                    [key, value_str],
                )
            logging.debug(done_message)

        # Verify loaded data (values go through %s placeholders so that the
        # log records format correctly).
        cookies = await page.context.cookies()
        logging.debug("Current Cookies: %s", cookies)
        local_storage = await page.evaluate("() => Object.fromEntries(Object.entries(localStorage))")
        logging.debug("Current Local Storage: %s", local_storage)
        session_storage = await page.evaluate("() => Object.fromEntries(Object.entries(sessionStorage))")
        logging.debug("Current Session Storage: %s", session_storage)
启动context需要携带user_data_dir:
import shutil
user_data_dir="../output/user_data"
if not os.path.exists(self._user_data_dir):
    os.makedirs(self._user_data_dir, exist_ok=True)
else:
    shutil.rmtree(user_data_dir, ignore_errors=True)
context = await p.chromium.launch_persistent_context(
    user_data_dir,
    headless=False,
    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

至此,可以正常启动并自动进入账号了。

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注