import scrapy
from datetime import datetime as dt
class NavernewsSpider(scrapy.Spider):
    """Scrape headline titles and press names from a Naver News section page.

    The spider requests a single breaking-news section listing and yields one
    dict per ``<li>`` found under ``<ul class="sa_list">``, with the keys
    ``title`` and ``author``.
    """

    name = "navernews"
    allowed_domains = ["news.naver.com"]
    custom_settings = {
        # Route downloads through the project's Selenium middleware.
        # NOTE(review): presumably needed because the listing is rendered
        # client-side -- confirm against the middleware implementation.
        "DOWNLOADER_MIDDLEWARES": {
            "scrapy_demo2.selenium_middleware.SeleniumMiddleware": 100,
        },
    }

    def __init__(self, *args, **kargs):
        """Initialise the base spider and set the list of start URLs.

        Args:
            *args: Positional arguments forwarded to ``scrapy.Spider``.
            **kargs: Keyword (spider) arguments forwarded to ``scrapy.Spider``.
        """
        # The base initialiser must run so that spider arguments and other
        # base-class state are set up; the original skipped it entirely.
        super().__init__(*args, **kargs)
        # The legacy ``/main/list.nhn`` endpoint took ``date`` and ``page``
        # query parameters; the section URL used now takes neither, so a
        # single fixed entry is sufficient.
        self.start_urls = [
            "https://news.naver.com/breakingnews/section/105/731",
        ]

    def start_requests(self):
        """Yield one GET request per start URL, handled by :meth:`parse`."""
        for url in self.start_urls:
            yield scrapy.Request(
                url=url, callback=self.parse, method="GET", encoding="utf-8"
            )

    def parse(self, response):
        """Extract one item per headline entry in the response.

        Args:
            response: The downloaded response for a section listing page.

        Yields:
            dict: ``{"title": ..., "author": ...}``. Each value is the
            whitespace-stripped text of the first matching node, or ``None``
            when no node matched.
        """
        self.logger.debug("Parsing %s", response)
        for content in response.xpath('//ul[@class="sa_list"]/li'):
            title = content.xpath(
                "div//div[@class='sa_text']/a/strong/text()"
            ).get()
            author = content.xpath(
                "div//div[@class='sa_text_press']/text()"
            ).get()
            item = {
                # Strip only when text was found; None and "" pass through.
                "title": title.strip() if title else title,
                "author": author.strip() if author else author,
            }
            self.logger.debug("Parsed item: %s", item)
            yield item