snippet.python
from time import sleep
 
from scrapy import signals
from scrapy.http import HtmlResponse
from scrapy.utils.python import to_bytes
 
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
 
from scrapy.utils.project import get_project_settings
 
 
class SeleniumMiddleware(object):
    def __init__(self):
        # Initialize the Selenium WebDriver that will render pages for Scrapy
        # CHROMEDRIVER_PATH = "./crawler/drivers/chromedriver_78"
        CHROMEDRIVER_PATH = "/opt/homebrew/bin/chromedriver"
        WINDOW_SIZE = "1920,1080"

        chrome_options = Options()
        # chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument(f"--window-size={WINDOW_SIZE}")

        # Route the browser through the SOCKS5 proxy configured in settings.py,
        # but only if HTTP_PROXY is actually set
        settings = get_project_settings()
        proxy_server = settings.get("HTTP_PROXY")
        if proxy_server:
            chrome_options.add_argument(f"--proxy-server=socks5://{proxy_server}")

        service = Service(executable_path=CHROMEDRIVER_PATH)
        self.driver = webdriver.Chrome(
            options=chrome_options,
            service=service,
        )
 
    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware
 
    def spider_opened(self, spider):
        # spider.logger.info("SeleniumMiddleware opened")
        pass
 
    def spider_closed(self, spider):
        # quit() shuts down the browser and the chromedriver process;
        # close() would only close the current window
        self.driver.quit()
 
    def process_request(self, request, spider):
        self.driver.get(request.url)

        # Crude fixed wait for JavaScript to finish rendering; an explicit
        # WebDriverWait on a known element would be more robust
        sleep(5)
        body = to_bytes(text=self.driver.page_source)

        # Returning an HtmlResponse here short-circuits Scrapy's downloader,
        # so the spider receives the Selenium-rendered page instead
        return HtmlResponse(
            url=request.url, body=body, encoding="utf-8", request=request
        )
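

# To activate this middleware, register it in the project's settings.py.
# A minimal sketch, assuming the project is named scrapy_demo2, this file is
# scrapy_demo2/middlewares.py, and the HTTP_PROXY value is a placeholder:
#
#     DOWNLOADER_MIDDLEWARES = {
#         "scrapy_demo2.middlewares.SeleniumMiddleware": 543,
#     }
#     HTTP_PROXY = "127.0.0.1:1080"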