open:navernews

import scrapy
from datetime import datetime as dt


class NavernewsSpider(scrapy.Spider):
    """Spider that scrapes headline titles and press names from Naver News.

    It requests a single section listing page (section 105 / 731) and yields
    one dict per ``<li>`` found under ``<ul class="sa_list">``, with the keys
    ``"title"`` and ``"author"``.
    """

    name = "navernews"
    allowed_domains = ["news.naver.com"]

    # Listing page that is crawled; the only entry placed in ``start_urls``.
    SECTION_URL = "https://news.naver.com/breakingnews/section/105/731"

    custom_settings = {
        # Route requests through the project's Selenium downloader middleware
        # (order 100). NOTE(review): presumably used so the page is rendered
        # in a real browser before parsing -- confirm against the middleware.
        "DOWNLOADER_MIDDLEWARES": {
            "scrapy_demo2.selenium_middleware.SeleniumMiddleware": 100,
        },
    }

    def __init__(self, *args, **kargs):
        """Initialise the spider and set its fixed list of start URLs.

        Positional and keyword arguments are forwarded to ``scrapy.Spider`` so
        that the base class is initialised properly (spider arguments
        included). ``start_urls`` is assigned afterwards, so it always holds
        exactly the section listing URL.
        """
        super().__init__(*args, **kargs)
        self.start_urls = [self.SECTION_URL]

    def start_requests(self):
        """Yield one GET request per start URL, handled by :meth:`parse`."""
        for url in self.start_urls:
            yield scrapy.Request(
                url=url, callback=self.parse, method="GET", encoding="utf-8"
            )

    @staticmethod
    def _strip_or_keep(text):
        """Return *text* stripped; falsy values (``None``, ``""``) unchanged."""
        return text.strip() if text else text

    def parse(self, response):
        """Extract one item per news entry on the listing page.

        Args:
            response: The downloaded listing page.

        Yields:
            dict: ``{"title": ..., "author": ...}``. A value is ``None`` when
            the corresponding node is missing from the entry.
        """
        self.logger.debug("Parsing %s", response)

        for entry in response.xpath('//ul[@class="sa_list"]/li'):
            # Both XPaths are relative to the current <li> element.
            title = entry.xpath(
                "div//div[@class='sa_text']/a/strong/text()"
            ).get()
            author = entry.xpath(
                "div//div[@class='sa_text_press']/text()"
            ).get()

            item = {
                "title": self._strip_or_keep(title),
                "author": self._strip_or_keep(author),
            }
            self.logger.debug("Scraped item: %r", item)

            yield item
  • open/navernews.1729216358.txt.gz
  • Last modified: 2024/10/18 01:52
  • by jace