AI with 재미
Naver News API 링크로 본문 가져오기
jhinux
2025. 11. 4. 14:22
📦 NewsPipeline Documentation
🔧 config.json
{
"NAVER_CLIENT_ID": "your client id",
"NAVER_CLIENT_SECRET": "your secret"
}
🧠 NewsPipeline Class
import json
import urllib.request
import urllib.parse
import datetime
import html
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
from newspaper import Article
class NewsPipeline:
    """Search Naver News for a keyword and collect the body of each hit.

    Typical use: ``search_news()`` fills ``self.records``; ``to_dataframe()``
    turns them into a DataFrame that ``display()`` prints and ``save_csv()``
    writes to disk.
    """

    # Column order shared by the DataFrame, the console view and the CSV.
    COLUMNS = ["시간", "제목", "링크", "요약", "본문"]
    # Links with this prefix are scraped via the "#newsct_article" selector.
    NAVER_ARTICLE_PREFIX = "https://n.news.naver.com/mnews/article"
    # Seconds to wait on the search API and on Naver article pages.
    REQUEST_TIMEOUT = 10

    def __init__(self, search_keyword: str, config_path: str = "config.json"):
        """Load API credentials from *config_path* and prepare an empty result list.

        Args:
            search_keyword: Query string sent to the news search API.
            config_path: JSON file holding ``NAVER_CLIENT_ID`` and
                ``NAVER_CLIENT_SECRET``.
        """
        self.search_keyword = search_keyword
        self.today = datetime.datetime.now().strftime('%Y-%m-%d')
        self.config = self.load_config(config_path)
        self.client_id = self.config.get("NAVER_CLIENT_ID")
        self.client_secret = self.config.get("NAVER_CLIENT_SECRET")
        self.records = []

    def load_config(self, path: str) -> dict:
        """Return the parsed JSON content of the file at *path*."""
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def _collapse_whitespace(s: str) -> str:
        """Replace every run of whitespace with one space and trim the ends."""
        return re.sub(r"\s+", " ", s).strip()

    def clean_text(self, s: str) -> str:
        """Strip HTML tags, decode entities and normalise whitespace.

        Tags are removed *before* entities are decoded, so text that was
        escaped in the input (e.g. ``&lt;word&gt;``) survives as literal
        ``<word>`` instead of being mistaken for a tag and deleted.

        Returns an empty string for ``None`` or empty input.
        """
        if not s:
            return ""
        s = re.sub(r"<[^>]+>", "", s)
        s = html.unescape(s)
        return self._collapse_whitespace(s)

    def search_news(self):
        """Query the news search API and feed the response to ``parse_response``.

        Failures are reported on stdout rather than raised, so a caller can
        still inspect whatever was collected in ``self.records``.
        """
        enc_text = urllib.parse.quote(self.search_keyword)
        url = f"https://openapi.naver.com/v1/search/news.json?query={enc_text}&start=1&display=50"
        request = urllib.request.Request(url)
        request.add_header("X-Naver-Client-Id", self.client_id)
        request.add_header("X-Naver-Client-Secret", self.client_secret)
        try:
            # Read the payload and close the connection before the slow
            # per-article scraping starts.
            with urllib.request.urlopen(request, timeout=self.REQUEST_TIMEOUT) as response:
                status = response.getcode()
                response_body = response.read().decode('utf-8') if status == 200 else None
            if response_body is None:
                print("❌ 요청 실패:", status)
                return
            print("✅ 뉴스 검색 성공")
            self.parse_response(response_body)
        except Exception as e:
            # Deliberately broad: this is the best-effort boundary of the
            # pipeline, and the error is surfaced to the user.
            print("❌ 오류 발생:", e)

    def parse_response(self, response_body: str):
        """Append one record per item of the JSON *response_body* to ``self.records``.

        Each record holds the publication date, cleaned title, link, cleaned
        description and the article body fetched by ``extract_content``.
        Progress is printed for every link.
        """
        response_json = json.loads(response_body)
        items = response_json.get("items", [])
        total = len(items)
        for idx, x in enumerate(items, start=1):
            title = self.clean_text(x.get("title"))
            link = x.get("link") or ""
            desc = self.clean_text(x.get("description"))
            pub = x.get("pubDate") or ""
            percent = round((idx / total) * 100, 1)
            print(f"🔗 [{idx}/{total}] ({percent}%) 링크 처리 중: {link}")
            content = self.extract_content(link)
            self.records.append({
                "시간": pub,
                "제목": title,
                "링크": link,
                "요약": desc,
                "본문": content
            })

    def extract_content(self, url: str) -> str:
        """Return the article body for *url*, or an error message string.

        Naver-hosted articles are fetched with ``requests`` and read from the
        ``#newsct_article`` element; every other URL goes through
        ``newspaper.Article``. Errors are returned as text, never raised.
        """
        if url.startswith(self.NAVER_ARTICLE_PREFIX):
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                                  '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
                }
                response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'lxml')
                article = soup.select_one("#newsct_article")
                if article is None:
                    return "본문을 찾을 수 없습니다."
                # Collapse newlines/tabs into single spaces; deleting them
                # (or deleting all spaces) would glue adjacent words together.
                return self._collapse_whitespace(article.text)
            except Exception as e:
                return f"네이버 본문 오류: {e}"
        else:
            try:
                article = Article(url, language='ko')
                article.download()
                article.parse()
                return article.text
            except Exception as e:
                return f"일반 본문 오류: {e}"

    def to_dataframe(self) -> pd.DataFrame:
        """Return the collected records as a DataFrame, newest first.

        The columns are always present, even when nothing was collected, so
        ``display`` and ``save_csv`` also work on an empty result. Dates that
        cannot be parsed become ``NaT``.
        """
        df = pd.DataFrame(self.records, columns=self.COLUMNS)
        df["시간"] = pd.to_datetime(df["시간"], errors="coerce")
        df = df.sort_values("시간", ascending=False).reset_index(drop=True)
        return df

    def display(self, df: pd.DataFrame):
        """Print *df* as a plain-text table with minute-resolution timestamps."""
        view = df.copy()
        view["시간"] = view["시간"].dt.strftime("%Y-%m-%d %H:%M")
        view = view[self.COLUMNS].astype(str)
        print(view.to_string(index=False))

    def save_csv(self, df: pd.DataFrame):
        """Write *df* to ``<keyword>-<date>-news_full.csv`` in the working directory.

        The file is encoded as UTF-8 with a BOM (``utf-8-sig``).
        """
        out_name = f"{self.search_keyword}-{self.today}-news_full.csv"
        df_out = df.copy()
        df_out["시간"] = df_out["시간"].dt.strftime("%Y-%m-%d %H:%M")
        df_out[self.COLUMNS].to_csv(out_name, index=False, encoding="utf-8-sig")
        print(f"\n📁 CSV 저장 완료: {out_name}")
🚀 실행 예시
# Example run: search, then print and export whatever was collected.
if __name__ == "__main__":
    pipeline = NewsPipeline("Naver)")
    pipeline.search_news()
    if not pipeline.records:
        # search_news() reports failures on stdout instead of raising, so an
        # empty result must be handled here before building the table.
        print("⚠️ 수집된 뉴스가 없어 출력과 저장을 건너뜁니다.")
    else:
        df = pipeline.to_dataframe()
        pipeline.display(df)
        pipeline.save_csv(df)