from time import sleep
from scrapy import signals
from scrapy.http import HtmlResponse
from scrapy.utils.python import to_bytes
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from scrapy_demo2 import settings
from scrapy.utils.project import get_project_settings
class SeleniumMiddleware(object):
    """Scrapy downloader middleware that fetches pages with a real Chrome
    browser through Selenium, so JavaScript-rendered content reaches the
    spiders as a normal :class:`HtmlResponse`.
    """

    # Default seconds to pause after each page load so client-side JS can
    # render; overridable via the SELENIUM_PAGE_LOAD_WAIT setting.
    DEFAULT_PAGE_LOAD_WAIT = 5

    def __init__(self):
        # Initialize the WebDriver.
        # CHROMEDRIVER_PATH = "./crawler/drivers/chromedriver_78"
        CHROMEDRIVER_PATH = "/opt/homebrew/bin/chromedriver"
        WINDOW_SIZE = "1920,1080"
        chrome_options = Options()
        # chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument(f"--window-size={WINDOW_SIZE}")
        settings = get_project_settings()
        proxy_server = settings.get("HTTP_PROXY")
        # Only configure a proxy when one is actually set; without this
        # guard an unset HTTP_PROXY produced the literal argument
        # "--proxy-server=socks5://None", silently breaking all requests.
        if proxy_server:
            chrome_options.add_argument(f"--proxy-server=socks5://{proxy_server}")
        # How long to wait after each navigation (seconds); configurable
        # instead of a hard-coded constant, defaulting to the old value.
        self.page_load_wait = settings.getint(
            "SELENIUM_PAGE_LOAD_WAIT", self.DEFAULT_PAGE_LOAD_WAIT
        )
        service = Service(executable_path=CHROMEDRIVER_PATH)
        self.driver = webdriver.Chrome(
            options=chrome_options,
            service=service,
        )

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy hook: build the middleware and connect spider lifecycle
        signals so the browser is shut down when the spider finishes."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def spider_opened(self, spider):
        # Nothing to do on open: the driver is already started in __init__.
        pass

    def spider_closed(self, spider):
        # quit() closes every window AND terminates the chromedriver
        # process; close() only closed the current window, leaking the
        # driver process on every run.
        self.driver.quit()

    def process_request(self, request, spider):
        """Load ``request.url`` in the browser and return the rendered DOM,
        bypassing Scrapy's default downloader for every request."""
        self.driver.get(request.url)
        # Crude fixed wait for client-side rendering to settle.
        sleep(self.page_load_wait)
        body = to_bytes(text=self.driver.page_source)
        return HtmlResponse(
            url=request.url, body=body, encoding="utf-8", request=request
        )