# NOTE(review): `dt` is no longer used by this spider (the date-based listing
# URL was retired); the import is kept so the module's import surface is
# unchanged.
from datetime import datetime as dt

import scrapy


class NavernewsSpider(scrapy.Spider):
    """Scrape article titles and press names from a Naver News listing page.

    Requests are routed through the project's ``SeleniumMiddleware``
    downloader middleware (registered at priority 100 in
    ``custom_settings``), presumably so that the listing is rendered by a
    real browser before parsing -- confirm against the middleware itself.

    Each yielded item is a plain ``dict`` with two keys:

    * ``"title"``  -- stripped headline text, or ``None`` when not found.
    * ``"author"`` -- stripped press/publisher text, or ``None`` when not
      found.
    """

    name = "navernews"
    allowed_domains = ["news.naver.com"]
    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {
            "scrapy_demo2.selenium_middleware.SeleniumMiddleware": 100,
        },
    }

    # Listing page that is crawled. It replaced an older date/page-based
    # ``main/list.nhn`` URL, which is why no pagination is built here.
    SECTION_URL = "https://news.naver.com/breakingnews/section/105/731"

    def __init__(self, *args, **kargs):
        """Initialise the spider and set the single start URL.

        Positional and keyword arguments are forwarded to
        ``scrapy.Spider.__init__`` so that spider arguments supplied on the
        command line (``-a key=value``) are applied instead of being
        silently discarded.
        """
        super().__init__(*args, **kargs)
        # Assigned after the base initialiser so this spider's URL always
        # wins over any default ``start_urls`` the base class sets up.
        self.start_urls = [self.SECTION_URL]

    def start_requests(self):
        """Yield one GET request per start URL, handled by :meth:`parse`."""
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                method="GET",
                encoding="utf-8",
            )

    def parse(self, response):
        """Extract one item per ``<li>`` under ``ul.sa_list``.

        Args:
            response: The response object delivered for a start URL.

        Yields:
            dict: ``{"title": ..., "author": ...}``; a value is ``None``
            when the corresponding node is missing from the list entry.
        """
        self.logger.debug("Parsing %s", response)

        for entry in response.xpath('//ul[@class="sa_list"]/li'):
            title = entry.xpath(
                "div//div[@class='sa_text']/a/strong/text()"
            ).extract_first()
            author = entry.xpath(
                "div//div[@class='sa_text_press']/text()"
            ).extract_first()

            # Only strip when a value was actually extracted; a missing
            # (None) or empty value is passed through unchanged.
            item = {
                "title": title.strip() if title else title,
                "author": author.strip() if author else author,
            }
            self.logger.debug("Scraped item: %r", item)
            yield item