import logging
from time import sleep

from scrapy import signals
from scrapy.http import HtmlResponse
from scrapy.utils.project import get_project_settings
from scrapy.utils.python import to_bytes
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

from scrapy_demo2 import settings

logger = logging.getLogger(__name__)


class SeleniumMiddleware(object):
    """Downloader middleware that fetches every request with a real Chrome
    browser driven by Selenium, so JavaScript-rendered pages reach spiders
    as ordinary ``HtmlResponse`` objects.

    One Chrome instance is created per crawler and reused for all requests;
    it is shut down when the spider closes.
    """

    def __init__(self):
        # Path to the chromedriver binary (Homebrew install location).
        # NOTE(review): hard-coded path — consider moving into project
        # settings so other machines don't need this exact layout.
        chromedriver_path = "/opt/homebrew/bin/chromedriver"
        window_size = "1920,1080"

        chrome_options = Options()
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument(f"--window-size={window_size}")

        # Use a local name that does not shadow the imported
        # ``scrapy_demo2.settings`` module.
        project_settings = get_project_settings()
        proxy_server = project_settings.get("HTTP_PROXY")
        logger.debug("HTTP_PROXY setting: %s", proxy_server)
        # Only configure a proxy when one is actually set; otherwise the
        # literal string "socks5://None" would be handed to Chrome.
        if proxy_server:
            chrome_options.add_argument(f"--proxy-server=socks5://{proxy_server}")

        service = Service(executable_path=chromedriver_path)
        self.driver = webdriver.Chrome(
            options=chrome_options,
            service=service,
        )

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy entry point: build the middleware and hook spider signals
        so the browser lifetime matches the spider lifetime."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def spider_opened(self, spider):
        """Signal hook; the browser is already started in __init__, so
        nothing needs to happen here."""
        pass

    def spider_closed(self, spider):
        # quit() (not close()) terminates the whole browser session AND the
        # chromedriver service process; close() only closes the current
        # window and would leak the driver process on every crawl.
        self.driver.quit()

    def process_request(self, request, spider):
        """Fetch *request* with Chrome and return the rendered page.

        Returning an HtmlResponse short-circuits Scrapy's normal download
        handler, so every request routed through this middleware goes via
        the browser.
        """
        self.driver.get(request.url)
        # Crude wait for JavaScript to finish rendering.
        # NOTE(review): a WebDriverWait on an expected condition would be
        # both faster and more reliable than a fixed sleep.
        sleep(5)
        body = to_bytes(text=self.driver.page_source)
        return HtmlResponse(
            url=request.url, body=body, encoding="utf-8", request=request
        )